diff --git a/custom_hls/sm_utils.hpp b/custom_hls/sm_utils.hpp
new file mode 100644
index 0000000000..918f8879bf
--- /dev/null
+++ b/custom_hls/sm_utils.hpp
@@ -0,0 +1,164 @@
+// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// This file is subject to the Xilinx Design License Agreement located
+// in the LICENSE.md file in the root directory of this repository.
+//
+// This file contains confidential and proprietary information of Xilinx, Inc.
+// and is protected under U.S. and international copyright and other
+// intellectual property laws.
+//
+// DISCLAIMER
+// This disclaimer is not a license and does not grant any rights to the materials
+// distributed herewith. Except as otherwise provided in a valid license issued to
+// you by Xilinx, and to the maximum extent permitted by applicable law: (1) THESE
+// MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX HEREBY
+// DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
+// INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR
+// FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
+// in contract or tort, including negligence, or under any other theory of
+// liability) for any loss or damage of any kind or nature related to, arising
+// under or in connection with these materials, including for any direct, or any
+// indirect, special, incidental, or consequential loss or damage (including loss
+// of data, profits, goodwill, or any type of loss or damage suffered as a result
+// of any action brought by a third party) even if such damage or loss was
+// reasonably foreseeable or Xilinx had been advised of the possibility of the
+// same.
+//
+// CRITICAL APPLICATIONS
+// Xilinx products are not designed or intended to be fail-safe, or for use in
+// any application requiring failsafe performance, such as life-support or safety
+// devices or systems, Class III medical devices, nuclear facilities, applications
+// related to the deployment of airbags, or any other applications that could lead
+// to death, personal injury, or severe property or environmental damage
+// (individually and collectively, "Critical Applications"). Customer assumes the
+// sole risk and liability of any use of Xilinx products in Critical Applications,
+// subject only to applicable laws and regulations governing limitations on product
+// liability.
+//
+// THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT ALL TIMES.
+#ifndef SM_UTIL_HPP
+#define SM_UTIL_HPP
+#include "hls_vector.h"
+
+//- Compile-Time Functions ---------------------------------------------------
+
+// ceil(log2(x))
+template<typename T>
+constexpr unsigned clog2(T x) {
+  return x<2? 0 : 1+clog2((x+1)/2);
+}
+
+//- Streaming Flit with `last` Marking ---------------------------------------
+template<typename T>
+struct flit_t {
+  bool last;
+  T data;
+
+public:
+  flit_t(bool last_, T const &data_) : last(last_), data(data_) {}
+  ~flit_t() {}
+};
+
+//- Streaming Copy -----------------------------------------------------------
+template<typename T>
+void move(hls::stream<T> &src, hls::stream<T> &dst) {
+#pragma HLS pipeline II=1 style=flp
+  if(!src.empty())  dst.write(src.read());
+}
+
+//- Tree Reduce --------------------------------------------------------------
+template< unsigned long N, typename TA, typename TR = TA, typename F >
+TR tree_reduce(hls::stream<TA> &v, F f) {
+#pragma HLS inline
+#pragma HLS function_instantiate variable=f
+  TR tree[2*N-1];
+#pragma HLS array_partition complete dim=1 variable=tree
+  for(unsigned i = N; i-- > 0;) {
+#pragma HLS unroll
+    tree[N-1 + i] = v.read();
+  }
+  for(unsigned i = N-1; i-- > 0;) {
+#pragma HLS unroll
+    tree[i] = f(tree[2*i+1], tree[2*i+2]);
+  }
+  return tree[0];
+}
+
+// Recursive comparison and count (of max)
+// Builds a tree to compute the max of a vector
+template<unsigned N, typename T>
+struct MaxReduction {
+
+  static T max(const hls::vector<T, N>& input) {
+#pragma HLS INLINE
+    constexpr unsigned M = (N + 1) / 2;
+    hls::vector<T, M> res;
+
+    for(unsigned i = 0; i < M; ++i) {
+#pragma HLS unroll
+      if (2*i + 1 < N)
+        res[i] = input[2*i] > input[2*i + 1] ? input[2*i] : input[2*i + 1];
+      else
+        res[i] = input[2*i]; // Handle the case where the input size is odd
+    }
+
+    return MaxReduction<M, T>::max(res);
+  }
+
+};
+
+template<typename T>
+struct MaxReduction<2, T> {
+  static T max(const hls::vector<T, 2>& input) {
+#pragma HLS INLINE
+    return (input[0] > input[1]) ? input[0] : input[1];
+  }
+};
+
+template<typename T>
+struct MaxReduction<1, T> {
+  static T max(const hls::vector<T, 1>& input) {
+#pragma HLS INLINE
+    return input[0];
+  }
+};
+
+// Recursive reduction tree for the total summation
+// Code for the Nth stage
+template<unsigned N>
+struct TreeReduction {
+  static float reduce(const hls::vector<float, N>& input) {
+#pragma HLS INLINE
+    constexpr unsigned M = (N + 1) / 2;
+    hls::vector<float, M> sum;
+
+    for(unsigned i = 0; i < M; ++i) {
+#pragma HLS unroll
+      if (2*i + 1 < N)
+        sum[i] = input[2*i] + input[2*i + 1];
+      else
+        sum[i] = input[2*i]; // Handle the case where the input size is odd
+    }
+
+    return TreeReduction<M>::reduce(sum);
+  }
+};
+
+template<>
+struct TreeReduction<2> {
+  static float reduce(const hls::vector<float, 2>& input) {
+#pragma HLS INLINE
+    return input[0] + input[1];
+  }
+};
+
+template<>
+struct TreeReduction<1> {
+  static float reduce(const hls::vector<float, 1>& input) {
+#pragma HLS INLINE
+    return input[0];
+  }
+};
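+
+// Illustrative usage (hypothetical host-side sketch, not part of the library):
+//   hls::vector<float, 8> v = {...};
+//   float total   = TreeReduction<8>::reduce(v);    // sum of all 8 lanes
+//   float biggest = MaxReduction<8, float>::max(v); // max of all 8 lanes
+//   constexpr unsigned bits = clog2(8);             // == 3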
+
+#endif
\ No newline at end of file
diff --git a/custom_hls/softmax.hpp b/custom_hls/softmax.hpp
new file mode 100644
index 0000000000..9452045a77
--- /dev/null
+++ b/custom_hls/softmax.hpp
@@ -0,0 +1,285 @@
+// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+//
+// This file is subject to the Xilinx Design License Agreement located
+// in the LICENSE.md file in the root directory of this repository.
+//
+// This file contains confidential and proprietary information of Xilinx, Inc.
+// and is protected under U.S. and international copyright and other
+// intellectual property laws.
+//
+// DISCLAIMER
+// This disclaimer is not a license and does not grant any rights to the materials
+// distributed herewith. Except as otherwise provided in a valid license issued to
+// you by Xilinx, and to the maximum extent permitted by applicable law: (1) THESE
+// MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX HEREBY
+// DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
+// INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR
+// FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
+// in contract or tort, including negligence, or under any other theory of
+// liability) for any loss or damage of any kind or nature related to, arising
+// under or in connection with these materials, including for any direct, or any
+// indirect, special, incidental, or consequential loss or damage (including loss
+// of data, profits, goodwill, or any type of loss or damage suffered as a result
+// of any action brought by a third party) even if such damage or loss was
+// reasonably foreseeable or Xilinx had been advised of the possibility of the
+// same.
+//
+// CRITICAL APPLICATIONS
+// Xilinx products are not designed or intended to be fail-safe, or for use in
+// any application requiring failsafe performance, such as life-support or safety
+// devices or systems, Class III medical devices, nuclear facilities, applications
+// related to the deployment of airbags, or any other applications that could lead
+// to death, personal injury, or severe property or environmental damage
+// (individually and collectively, "Critical Applications"). Customer assumes the
+// sole risk and liability of any use of Xilinx products in Critical Applications,
+// subject only to applicable laws and regulations governing limitations on product
+// liability.
+//
+// THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT ALL TIMES.
+
+#include <ap_int.h>
+#include <ap_fixed.h>
+#include <hls_stream.h>
+#include <hls_vector.h>
+#include <hls_math.h>
+#include <cmath>
+#include <climits>
+#include <type_traits>
+#include "sm_utils.hpp"
+
+// First stage of the pipeline:
+//
+// Trigger: When a vector of SIMD elements is present in the stream
+//
+// Desc: Pass over the input N items and calc the max value
+template<unsigned N, unsigned SIMD, typename T>
+void max_calc_stage(
+  hls::stream<hls::vector<T, SIMD>> &ins,
+  hls::stream<hls::vector<T, SIMD>> &outs,
+  hls::stream<T> &maxs
+) {
+#pragma HLS pipeline II=1 style=flp
+  static ap_uint<clog2(N/SIMD) + 1> count = 0;
+  static T max = 0;
+#pragma HLS reset variable=count
+#pragma HLS reset variable=max
+
+  if(!ins.empty()){
+    hls::vector<T, SIMD> out;
+    hls::vector<T, SIMD> max_v;
+    hls::vector<T, SIMD> const in = ins.read();
+
+    for(unsigned i = 0; i < SIMD; i++) {
+#pragma HLS unroll
+      out[i] = in[i];
+      // fold the running max into each lane before the tree reduction
+      max_v[i] = in[i] > max ? in[i] : max;
+    }
+    outs.write(out);
+    max = MaxReduction<SIMD, T>::max(max_v);
+
+    count++;
+    if (count == (N/SIMD)-1) {
+      count = 0;
+      maxs.write(max);
+      max = 0;
+    }
+  }
+}
+
+// Second stage of the pipeline
+//
+// Trigger: When a max value is sent from the preceding stage
+//
+// Desc: For each item in a N item sequence calc the (exp - max) in float
+//       track the sum while processing the N items.
+template<unsigned N, unsigned SIMD, typename T>
+void exp_sum_calc(
+  hls::stream<hls::vector<T, SIMD>> &ins,
+  hls::stream<T> &maxs,
+  hls::stream<hls::vector<float, SIMD>> &outs,
+  hls::stream<float> &sums
+){
+#pragma HLS pipeline II=1 style=flp
+  static ap_uint<clog2(N/SIMD) + 1> count = 0;
+  static float sum = 0.0f;
+  static bool valid = false;
+  static float max = 0.0f;
+#pragma HLS reset variable=count
+#pragma HLS reset variable=sum
+#pragma HLS reset variable=valid
+#pragma HLS reset variable=max
+
+  if (count == (N/SIMD)) {
+    count = 0;
+    valid = false;
+    sums.write(sum);
+    sum = 0.0f;
+    return;
+  }
+
+  if(valid && !ins.empty()) {
+    hls::vector<T, SIMD> const in = ins.read();
+    hls::vector<float, SIMD> out;
+    for (unsigned i = 0; i < SIMD; i++) {
+#pragma HLS unroll
+      out[i] = hls::exp(float(in[i]) - max);
+    }
+    sum += TreeReduction<SIMD>::reduce(out);
+    outs.write(out);
+
+    count++;
+  }
+
+  if (!maxs.empty() && !valid) {
+    max = maxs.read();
+    valid = true;
+  }
+
+}
+
+// Third stage of the pipeline
+//
+// Trigger: When a sum value is sent from the preceding stage
+//
+// Desc: For the N items take the input and divide it by the sum
+template<unsigned N, unsigned SIMD>
+void div_calc(
+  hls::stream<hls::vector<float, SIMD>> &ins,
+  hls::stream<float> &sums,
+  hls::stream<hls::vector<float, SIMD>> &outs
+){
+#pragma HLS pipeline II=1 style=flp
+  static ap_uint<clog2(N/SIMD) + 1> count = 0;
+  static bool valid = false;
+  static float sum = 0.0f;
+#pragma HLS reset variable=count
+#pragma HLS reset variable=valid
+#pragma HLS reset variable=sum
+
+  if (count == (N/SIMD)) {
+    count = 0;
+    valid = false;
+    return;
+  }
+
+  if (valid && !ins.empty()) {
+    hls::vector<float, SIMD> const in = ins.read();
+    hls::vector<float, SIMD> out;
+    for(unsigned i = 0; i < SIMD; i++) {
+#pragma HLS unroll
+      out[i] = in[i] / sum;
+    }
+    outs.write(out);
+    count++;
+  }
+
+  if (!sums.empty() && !valid) {
+    sum = sums.read();
+    valid = true;
+  }
+}
+
+template<unsigned N, unsigned SIMD, typename T>
+void smax(
+  hls::stream<hls::vector<T, SIMD>> &src,
+  hls::stream<hls::vector<float, SIMD>> &dst
+) {
+#pragma HLS dataflow disable_start_propagation
+  static_assert(N%SIMD == 0, "N must be a multiple of SIMD");
+
+  static hls::stream<hls::vector<T, SIMD>> max_data_s;
+#pragma HLS stream variable=max_data_s depth=N
+  static hls::stream<T> max_s;
+#pragma HLS stream variable=max_s depth=2
+
+  static hls::stream<hls::vector<float, SIMD>> exp_data_s;
+#pragma HLS stream variable=exp_data_s depth=N
+  static hls::stream<float> sum_s;
+#pragma HLS stream variable=sum_s depth=2
+
+  max_calc_stage<N, SIMD, T>(src, max_data_s, max_s);
+  exp_sum_calc<N, SIMD, T>(max_data_s, max_s, exp_data_s, sum_s);
+  div_calc<N, SIMD>(exp_data_s, sum_s, dst);
+
+} // smax()
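+
+// Illustrative instantiation (hypothetical values, assuming an INT8 input
+// feature dimension of N=384 consumed SIMD=4 elements at a time):
+//   hls::stream<hls::vector<ap_int<8>, 4>> src;
+//   hls::stream<hls::vector<float, 4>>     dst;
+//   smax<384, 4, ap_int<8>>(src, dst);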
+
+// Threshold/quantisation at the output of the softmax
+template<
+  typename T,  // The quantised output type (Needs to be signed)
+  typename TF  // The float based input type
+>
+T quant_threshold(TF val) {
+#pragma HLS INLINE
+  constexpr unsigned numBits = sizeof(T)*CHAR_BIT;
+  if(val >= 1.0f){
+    T frac_val = ~T(0);
+    if(std::is_signed<T>::value) {
+      return frac_val;
+    } else {
+      T mask = ~(T(1) << (numBits - 1));
+      return frac_val & mask;
+    }
+  }
+
+  ap_fixed<numBits, 1> fixed_point_val = val;
+  T frac_val = fixed_point_val.range(numBits - 2, 0);
+  return frac_val;
+}
+
+// Quantisation pipeline stage
+//
+// Trigger: When a SIMD vector is received from the preceding stage
+//
+// Desc: Apply quantisation to the SIMD elements and write them into the
+//       SIMD width output stream.
+template<
+  unsigned N,
+  unsigned SIMD,
+  typename T
+>
+void quant_stage(
+  hls::stream<hls::vector<float, SIMD>> &in,
+  hls::stream<hls::vector<T, SIMD>> &out
+) {
+#pragma HLS pipeline II=1 style=flp
+  if(!in.empty()) {
+    hls::vector<float, SIMD> const x = in.read();
+    hls::vector<T, SIMD> y;
+    for(unsigned i = 0; i < SIMD; i++) {
+#pragma HLS unroll
+      y[i] = quant_threshold<T, float>(x[i]);
+    }
+    out.write(y);
+  }
+}
+
+// Quantised version of softmax
+// This is the same as the float softmax with an additional baked in quantisation stage at the end
+template<
+  unsigned N,     // The width of the input dimension
+  unsigned SIMD,  // Amount of parallelism (how many items consumed/produced at a time)
+  typename T
+>
+void smaxquant(
+  hls::stream<hls::vector<T, SIMD>> &src,
+  hls::stream<hls::vector<T, SIMD>> &dst
+) {
+#pragma HLS DATAFLOW disable_start_propagation
+  hls::stream<hls::vector<float, SIMD>> smax_out;
+#pragma HLS stream variable=smax_out depth=2
+  static_assert(N%SIMD == 0, "SIMD must be a factor of N");
+
+  smax<N, SIMD, T>(src, smax_out);
+  quant_stage<N, SIMD, T>(smax_out, dst);
+
+} // smaxquant()
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index c7500bcaa6..4004523bad 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -81,14 +81,14 @@ if [ -f "$VITIS_PATH/settings64.sh" ];then
   export XILINX_XRT=/opt/xilinx/xrt
   source $VITIS_PATH/settings64.sh
   gecho "Found Vitis at $VITIS_PATH"
-  if [ -f "$XILINX_XRT/setup.sh" ];then
-    # source XRT
-    source $XILINX_XRT/setup.sh
-    gecho "Found XRT at $XILINX_XRT"
-  else
-    recho "XRT not found on $XILINX_XRT, did you skip the download or did the installation fail?"
-    exit -1
-  fi
+  # if [ -f "$XILINX_XRT/setup.sh" ];then
+  #   # source XRT
+  #   source $XILINX_XRT/setup.sh
+  #   gecho "Found XRT at $XILINX_XRT"
+  # else
+  #   recho "XRT not found on $XILINX_XRT, did you skip the download or did the installation fail?"
+  #   exit -1
+  # fi
 else
   yecho "Unable to find $VITIS_PATH/settings64.sh"
   yecho "Functionality dependent on Vitis will not be available."
@@ -137,6 +137,15 @@ else
   echo "See https://docs.xilinx.com/r/en-US/ug835-vivado-tcl-commands/Tcl-Initialization-Scripts"
 fi
 
+# add hls library path to LD_LIBRARY_PATH
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/fpo_v7_1"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/fft_v9_1"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/fir_v7_0"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/dds_v6_0"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/tps/lnx64/gcc-8.3.0/lib"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lib/lnx64.o/Rhel"
+export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/lib/csim"
+
 export PATH=$PATH:$HOME/.local/bin
 # execute the provided command(s) as root
 exec "$@"
diff --git a/fetch-repos.sh b/fetch-repos.sh
index 2033973f2a..847952e399 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -115,10 +115,10 @@ fetch_board_files() {
     cd $OLD_PWD
 }
 
-fetch_repo $QONNX_URL $QONNX_COMMIT $QONNX_DIR
-fetch_repo $FINN_EXP_URL $FINN_EXP_COMMIT $FINN_EXP_DIR
-fetch_repo $BREVITAS_URL $BREVITAS_COMMIT $BREVITAS_DIR
-fetch_repo $PYVERILATOR_URL $PYVERILATOR_COMMIT $PYVERILATOR_DIR
+# fetch_repo $QONNX_URL $QONNX_COMMIT $QONNX_DIR
+# fetch_repo $FINN_EXP_URL $FINN_EXP_COMMIT $FINN_EXP_DIR
+# fetch_repo $BREVITAS_URL $BREVITAS_COMMIT $BREVITAS_DIR
+# fetch_repo $PYVERILATOR_URL $PYVERILATOR_COMMIT $PYVERILATOR_DIR
 fetch_repo $CNPY_URL $CNPY_COMMIT $CNPY_DIR
 fetch_repo $HLSLIB_URL $HLSLIB_COMMIT $HLSLIB_DIR
 fetch_repo $OMX_URL $OMX_COMMIT $OMX_DIR
diff --git a/run-docker.sh b/run-docker.sh
index 88fabff2fa..b1fe44eb0c 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -265,6 +265,36 @@ if [ ! -z "$FINN_XILINX_PATH" ];then
-z "$FINN_XILINX_PATH" ];then DOCKER_EXEC+="-e ALVEO_TARGET_DIR=$ALVEO_TARGET_DIR " fi fi + +# This part is used for internal ci for finn-examples +# if using build verification for finn-examples ci, set up the necessary Docker variables +if [ "$VERIFICATION_EN" = 1 ]; then + if [ -z "$FINN_EXAMPLES_ROOT" ]; then + recho "FINN_EXAMPLES_ROOT path has not been set." + recho "Please set FINN_EXAMPLES_ROOT path to enable verification." + exit -1 + elif [ ! -d "${FINN_EXAMPLES_ROOT}/ci" ]; then + recho "ci folder not found in ${FINN_EXAMPLES_ROOT}." + recho "Please ensure the FINN-examples repo has been set up correctly, and FINN_EXAMPLES_ROOT path is set correctly, to enable verification." + exit -1 + elif [ -z "$VERIFICATION_IO" ]; then + recho "VERIFICATION_IO paths has not been set." + recho "Please ensure the path to the input and expected output files has been set correctly to eneable verification." + exit -1 + elif [ ! -d "$VERIFICATION_IO" ]; then + recho "${VERIFICATION_IO} is not a directory." + recho "Please ensure the VERIFICATION_IO path has been set to the directory containing the input and expected output files for verification." + exit -1 + else + DOCKER_EXEC+="-e VERIFICATION_EN=$VERIFICATION_EN " + DOCKER_EXEC+="-e FINN_EXAMPLES_ROOT=$FINN_EXAMPLES_ROOT " + DOCKER_EXEC+="-e VERIFICATION_IO=$VERIFICATION_IO " + FINN_DOCKER_EXTRA+="-v $FINN_EXAMPLES_ROOT/ci:$FINN_EXAMPLES_ROOT/ci " + FINN_DOCKER_EXTRA+="-v $VERIFICATION_IO:$VERIFICATION_IO " + fi +fi + + DOCKER_EXEC+="$FINN_DOCKER_EXTRA " if [ -z "$FINN_SINGULARITY" ];then diff --git a/setup.cfg b/setup.cfg index 4834011dea..511ce451dd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,6 +32,7 @@ [metadata] name = finn +version = 0.9.0 description = A Framework for Fast, Scalable Quantized Neural Network Inference author = Yaman Umuroglu author_email = yamanu@xilinx.com @@ -60,6 +61,60 @@ package_dir = # tests_require = pytest; pytest-cov # Require a specific Python version, e.g. 
# Python 2.7 or >= 3.4
 # python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
+install_requires =
+    qonnx @ git+https://github.com/fastmachinelearning/qonnx.git@fd61cfeebbdaba351abf7e9d54cd785d7776fa4f
+    pyverilator @ git+https://github.com/maltanar/pyverilator.git@766e457465f5c0dd315490d7b9cc5d74f9a76f4f
+    brevitas @ git+https://github.com/Xilinx/brevitas.git@84f42259ec869eb151af4cb8a8b23ad925f493db
+    finn-experimental @ git+https://github.com/Xilinx/finn-experimental.git@de99347e936d51715f5356a1b6c64e37b91c23c2
+    dataset_loading @ git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading
+    bitstring==3.1.7
+    clize==5.0.1
+    dataclasses-json==0.5.7
+    gspread==3.6.0
+    importlib-resources==6.1.0
+    ipython==8.12.2
+    numpy==1.24.1
+    onnx==1.13.0
+    onnxoptimizer
+    onnxruntime==1.16.1
+    pre-commit==3.3.2
+    protobuf==3.20.3
+    psutil==5.9.4
+    pyscaffold==4.4
+    scipy==1.10.1
+    setupext-janitor>=1.1.2
+    setuptools==68.2.2
+    sigtools==4.0.1
+    toposort==1.7.0
+    vcdvcd==1.0.5
+    wget==3.2
+    torch==1.13.1
+    torchvision==0.14.1
+    torchaudio==0.13.1
+    pygments==2.14.0
+    ipykernel==6.21.2
+    jupyter==1.0.0
+    markupsafe==2.0.1
+    matplotlib==3.7.0
+    pytest-dependency==0.5.1
+    pytest-xdist[setproctitle]==3.2.0
+    pytest-parallel==0.1.1
+    netron>=5.0.0
+    pandas==1.5.3
+    scikit-learn==1.2.1
+    tqdm==4.64.1
+    pytest==6.2.5
+    pytest-metadata==1.7.0
+    pytest-html==3.0.0
+    pytest-html-merger==0.0.8
+    pytest-cov==4.1.0
+    deap==1.3.1
+    mip==1.13.0
+    networkx==2.8
+    future-annotations==1.0.0
+    dependencies==2.0.1
+    tokenize-rt==4.2.1
+    tclwrapper==0.0.1
 
 [options.packages.find]
 where = src
@@ -164,14 +219,3 @@ exclude =
     dist
     .eggs
     docs/conf.py
-
-[pyscaffold]
-# PyScaffold's parameters when the project was created.
-# This will be used when updating. Do not change!
-version = 3.2.1
-package = finn
-extensions =
-    travis
-    pre_commit
-    namespace
-namespace = finn
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index aed2ab7fe1..9bcbb1e860 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -54,6 +54,8 @@ from finn.custom_op.fpgadataflow.thresholding import Thresholding
 from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour
 from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU
+from finn.custom_op.fpgadataflow.quantsoftmax import QuantSoftmax
+
 
 custom_op = dict()
 
@@ -81,3 +83,4 @@ custom_op["StreamingEltwise"] = StreamingEltwise
 custom_op["StreamingMaxPool"] = StreamingMaxPool
 custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour
+custom_op["QuantSoftmax"] = QuantSoftmax
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
index 405c47a08d..8f5a0a7cc7 100644
--- a/src/finn/custom_op/fpgadataflow/hls/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -52,6 +52,7 @@ from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls
 from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls
 from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VVAU_hls
+from finn.custom_op.fpgadataflow.hls.quantsoftmax_hls import QuantSoftmax_hls
 
 custom_op = dict()
 
@@ -79,3 +80,4 @@ custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls
 custom_op["MVAU_hls"] = MVAU_hls
 custom_op["VVAU_hls"] = VVAU_hls
+custom_op["QuantSoftmax_hls"] = QuantSoftmax_hls
diff --git a/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py
new file mode 100644
index 0000000000..19903866b3
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/quantsoftmax_hls.py
@@ -0,0 +1,184 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
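+
+"""HLS backend implementation of the QuantSoftmax custom op. Only cppsim
+execution is supported; rtlsim is not implemented for this node."""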
+
+import numpy as np
+import os
+
+from finn.custom_op.fpgadataflow import templates
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.quantsoftmax import QuantSoftmax
+from finn.util.basic import CppBuilder
+
+
+class QuantSoftmax_hls(QuantSoftmax, HLSBackend):
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(QuantSoftmax.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = [
+            '#include <hls_vector.h>',
+            '#include "softmax.hpp"',
+            '#include "utils.hpp"',
+        ]
+
+    def defines(self, var):
+        simd = self.get_nodeattr("simd")
+        dtype = self.get_input_datatype()
+        channels = self.get_nodeattr("channels")
+        self.code_gen_dict["$DEFINES$"] = [
+            f"""
+            constexpr unsigned SIMD = {simd};
+            constexpr unsigned W = {channels};
+            using T = {dtype.get_hls_datatype_str()};
+            using F = float;
+            """
+        ]
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            f'''
+            static hls::stream<hls::vector<T, SIMD>> src0;
+            static hls::stream<hls::vector<T, SIMD>> dst0;
+
+            move(in0_{self.hls_sname()}, src0);
+            smaxquant<W, SIMD, T>(src0, dst0);
+            move(dst0, out_{self.hls_sname()});
+            '''
+        ]
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            f'''
+            void {self.onnx_node.name}(
+                hls::stream<hls::vector<T, SIMD>> &in0_{self.hls_sname()},
+                hls::stream<hls::vector<T, SIMD>> &out_{self.hls_sname()}
+            )
+            '''
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = [
+            f'''
+            #pragma HLS interface AXIS port=in0_{self.hls_sname()}
+            #pragma HLS interface AXIS port=out_{self.hls_sname()}
+            #pragma HLS aggregate variable=in0_{self.hls_sname()} compact=bit
+            #pragma HLS aggregate variable=out_{self.hls_sname()} compact=bit
+
+            #pragma HLS interface ap_ctrl_none port=return
+            #pragma HLS dataflow disable_start_propagation
+            '''
+        ]
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+            inp = context[node.input[0]]
+            export_idt = self.get_input_datatype()
+            inp = inp.reshape(folded_ishape)
+            np.save(os.path.join(code_gen_dir, "input_0.npy"), inp)
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+        else:
+            raise Exception(f"Unsupported execution mode: {mode}")
+
+    def compile_singlenode_code(self):
+        """Builds the bash script for compilation using the CppBuilder from
+        finn.util.basic and executes the script to produce the executable."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        builder = CppBuilder()
+        # to enable additional debug features please uncomment the next line
+        # builder.append_includes("-DDEBUG")
+        builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp")
+        builder.append_includes("-I$FINN_ROOT/deps/cnpy/")
+        builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib")
+        builder.append_includes("-I$FINN_ROOT/custom_hls")
+        builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"]))
+        builder.append_includes("--std=c++14")
+        builder.append_includes("-O3")
+        builder.append_sources(code_gen_dir + "/*.cpp")
+        builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp")
+        builder.append_includes("-lz")
+        builder.append_includes(
+            '-fno-builtin -fno-inline -Wl,-rpath,"$HLS_PATH/lnx64/lib/csim" -L$HLS_PATH/lnx64/lib/csim -lhlsmc++-GCC46'
+        )
+        builder.append_includes(
+            "-L$HLS_PATH/lnx64/tools/fpo_v7_1 -lgmp -lmpfr -lIp_floating_point_v7_1_bitacc_cmodel"
+        )
+        builder.set_executable_path(code_gen_dir + "/node_model")
+        builder.build(code_gen_dir)
+        self.set_nodeattr("executable_path", builder.executable_path)
+
+    def code_generation_cppsim(self, model):
+        """Generates c++ code for simulation (cppsim)."""
+        self.code_gen_dict["$READNPYDATA$"] = [""]
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [""]
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = [""]
+        node = self.onnx_node
+        path = self.get_nodeattr("code_gen_dir_cppsim")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+        self.generate_params(model, path)
+        self.global_includes()
+        self.defines("cppsim")
+        self.pragmas()
+        oshape = self.get_folded_output_shape()
+        oshape_str = str(oshape).replace("(", "{").replace(")", "}")
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            f'''
+            static hls::stream<hls::vector<T, SIMD>> in0_V;
+            static hls::stream<hls::vector<T, SIMD>> out_V;
+
+            npy2vectorstream("{path}/input_0.npy", in0_V);
+            int stream_size = in0_V.size();
+
+            while(out_V.size() != stream_size){{
+                smaxquant<W, SIMD, T>(in0_V, out_V);
+            }}
+
+            vectorstream2npy(out_V, {oshape_str}, "{path}/output.npy");
+            '''
+        ]
+        self.save_as_npy()
+
+        template = templates.docompute_template
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + f"/execute_{node.op_type}.cpp"
+        with open(code_gen_dir, "w") as f:
+            for key in self.code_gen_dict:
+                # transform list into long string separated by '\n'
+                code_gen_line = "\n".join(self.code_gen_dict[key])
+                template = template.replace(key, code_gen_line)
+            f.write(template)
+
+    def prepare_rtlsim(self):
+        # this node currently does not support rtlsim
+        raise NotImplementedError("QuantSoftmax_hls does not support rtlsim")
\ No newline at end of file
diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py
index d8210fd684..5436aa31af 100644
--- a/src/finn/custom_op/fpgadataflow/hlsbackend.py
+++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py
@@ -307,16 +307,21 @@ def npy_to_dynamic_outputs(self, context, npy_list):
 
     def exec_precompiled_singlenode_model(self):
         """Executes precompiled executable."""
-        executable_path = self.get_nodeattr("executable_path")
-        if executable_path == "":
+        executable = self.get_nodeattr("executable_path")
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        if executable == "":
             raise Exception(
                 """
Found no executable for this node, did you run the codegen and
compilation transformations?
            """
             )
-        process_execute = subprocess.Popen(executable_path, stdout=subprocess.PIPE)
-        process_execute.communicate()
+        with open(code_gen_dir + "/sim.log", "w") as f:
+            try:
+                subprocess.check_output(executable, stderr=f)
+            except subprocess.CalledProcessError:
+                raise Exception(f"Error running the generated code. Check {f.name} for more details.")
+
 
     def hls_sname(self):
         """Get the naming convention used by Vitis HLS for stream signals
diff --git a/src/finn/custom_op/fpgadataflow/quantsoftmax.py b/src/finn/custom_op/fpgadataflow/quantsoftmax.py
new file mode 100644
index 0000000000..ac9c17fb63
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/quantsoftmax.py
@@ -0,0 +1,120 @@
+
+import warnings
+
+import numpy as np
+from onnx.helper import make_node
+from qonnx.core.datatype import DataType
+from scipy.special import softmax
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class QuantSoftmax(HWCustomOp):
+    """Abstraction layer for HW implementation of QuantSoftmax layers."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "ifm_dim": ("ints", True, []),
+            "simd": ("i", False, 1),
+            "channels": ("i", True, 0),
+            # FINN DataTypes for inputs, weights, outputs
+            "data_type": ("s", True, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self, ind=0):
+        h, w = self.get_nodeattr("ifm_dim")
+        c = self.get_nodeattr("channels")
+        return (1, h, w, c)
+
+    def get_normal_output_shape(self, ind=0):
+        return self.get_normal_input_shape()
+
+    def get_number_output_values(self):
+        raise NotImplementedError("This function is not yet implemented.")
+
+    def quantise_to_int(self, arr, dtype):
+        max_val = np.iinfo(dtype).max
+        output = np.zeros_like(arr, dtype=dtype)
+        frac_part = arr - np.floor(arr)
+        scaled_frac = frac_part * max_val
+        output = scaled_frac.astype(dtype)
+        output[arr >= 1.0] = max_val
+        return output
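+
+    # Worked example (illustrative): with dtype=np.int8, max_val is 127, so an
+    # input of 0.5 maps to int(0.5 * 127) = 63, and any input >= 1.0 saturates
+    # to 127.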
+
+    def execute_node(self, context, graph):
+        node = self.onnx_node
+        input_data = context[node.input[0]]
+        output_data = softmax(input_data, axis=-1)
+        qsm_out = self.quantise_to_int(output_data, np.int8)
+        context[node.output[0]] = qsm_out
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        data_type = DataType[self.get_nodeattr("data_type")]
+        # the hlslib op always pads with zeros, so ensure that the DataType
+        # is able to represent zeros
+        assert data_type.allowed(0), "DataType must support zero"
+        return data_type
+
+    def make_shape_compatible_op(self, model):
+        shape = self.get_normal_input_shape()
+        # create an ONNX Softmax node with the same shape as this one
+        return make_node(
+            "Softmax",
+            inputs=[self.onnx_node.input[0]],
+            outputs=[self.onnx_node.output[0]],
+            shape=list(shape),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "data_type changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("data_type", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
+
+    def verify_node(self):
+        raise NotImplementedError
+
+    def get_instream_width(self, ind=0):
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("simd")
+        return ibits * simd
+
+    def get_outstream_width(self, ind=0):
+        obits = self.get_output_datatype().bitwidth()
+        simd = self.get_nodeattr("simd")
+        return obits * simd
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_folded_output_shape(self, ind=0):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("channels")
+        simd = self.get_nodeattr("simd")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
+
+    def get_folded_input_shape(self, ind=0):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("channels")
+        simd = self.get_nodeattr("simd")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
\ No newline at end of file
diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py
index c31f90af0b..d1e9387b1b 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py
@@ -186,9 +186,23 @@ def prepare_codegen_rtl_values(self, model):
         n_thres_steps = self.get_nodeattr("numSteps")
         wdt = self.get_weight_datatype()
         if expected_thresholds != n_thres_steps:
-            min_val = wdt.min()
-            thresholds = np.insert(thresholds, 0, min_val, axis=1)
-            bias = bias - 1
+            if DataType[output_data_type].signed():
+                min_val = wdt.min()
+                thresholds = np.insert(thresholds, 0, min_val, axis=1)
+                bias = bias - 1
+            # TODO: temporary fix for unsigned narrow quantization
+            else:
+                max_val = wdt.max()
+                if max_val > DataType[input_data_type].max():
+                    thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1)
+                else:
+                    max_val = max_val + 1
+                    # increase wdt
+                    if not wdt.signed():
+                        wdt = DataType.get_smallest_possible(max_val)
+                    else:
+                        wdt = DataType.get_smallest_possible(-max_val - 1)
+                    thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1)
+                n_thres_steps += 1
 
         # add dummy dimension as final dimension (that's what gets packed with next call)
@@ -528,8 +542,22 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         n_thres_steps = self.get_nodeattr("numSteps")
         wdt = self.get_weight_datatype()
         if expected_thresholds != n_thres_steps:
-            min_val = wdt.min()
-            thresholds = np.insert(thresholds, 0, min_val, axis=1)
+            if DataType[output_data_type].signed():
+                min_val = wdt.min()
+                thresholds = np.insert(thresholds, 0, min_val, axis=1)
+            # TODO: temporary fix for unsigned narrow quantization
+            else:
+                max_val = wdt.max()
+                if max_val > self.get_input_datatype().max():
+                    thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1)
+                else:
+                    max_val = max_val + 1
+                    # increase wdt
+                    if not wdt.signed():
+                        wdt = DataType.get_smallest_possible(max_val)
+                    else:
+                        wdt = DataType.get_smallest_possible(-max_val - 1)
+                    thresholds = np.insert(thresholds, len(thresholds[0]), max_val, axis=1)
+                n_thres_steps += 1
 
         # If a single threshold value is found, broadcast the value
@@ -541,7 +569,6 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         thresh_padded = np.zeros((thresholds.shape[0], width_padded))
         thresh_padded[: thresholds.shape[0], :n_thres_steps] = thresholds
         thresh_stream = []
-        wdt = self.get_weight_datatype()
         bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32)
         padding = np.zeros(width_padded, dtype=np.int32)
 
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 3d89a0ab23..8c9e99a578 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -32,6 +32,7 @@
 #define AP_INT_MAX_W $AP_INT_MAX_W$
 #include "cnpy.h"
 #include "npy2apintstream.hpp"
+#include "npy2vectorstream.hpp"
 #include <vector>
 #include "bnn-library.h"
 
@@ -42,18 +43,22 @@
 $DEFINES$
 
 int main(){
-$PRAGMAS$
-$STREAMDECLARATIONS$
+    $PRAGMAS$
 
-$READNPYDATA$
+    try {
+        $STREAMDECLARATIONS$
 
-$DOCOMPUTE$
+        $READNPYDATA$
 
-$DATAOUTSTREAM$
+        $DOCOMPUTE$
 
-$SAVEASCNPY$
+        $DATAOUTSTREAM$
 
+        $SAVEASCNPY$
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+    }
 }
 """
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index e14181b140..e400e4335f 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1697,3 +1697,55 @@ def apply(self, model):
         model = model.transform(InferShapes())
         model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class InferQuantSoftmax(Transformation):
+    '''
+    Find Softmax layers that are followed by a MultiThreshold layer and
+    replace the pair with a single QuantSoftmax layer.
+    '''
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for n in graph.node:
+            node_ind += 1
+            # check for a Softmax node that is directly followed by a MultiThreshold
+            consumer = model.find_consumer(n.output[0])
+            if n.op_type == "Softmax" and consumer is not None and consumer.op_type == "MultiThreshold":
+                # get the shape of the input/output tensor
+                input_shape = model.get_tensor_shape(n.input[0])
+                assert input_shape == model.get_tensor_shape(
+                    consumer.input[0]
+                ), "Softmax and MultiThreshold input shapes do not match"
+                h = int(input_shape[1])
+                w = int(input_shape[2])
+                c = int(input_shape[3])
+                idt0 = model.get_tensor_datatype(n.input[0])
+                # create node with no parallelization first
+                simd = 1
+                # create and insert new node
+                new_node = helper.make_node(
+                    "QuantSoftmax",
+                    [n.input[0]],  # input tensor(s)
+                    [consumer.output[0]],  # output tensor(s)
+                    domain="finn.custom_op.fpgadataflow",
+                    backend="fpgadataflow",
+                    ifm_dim=[h, w],
+                    channels=c,
+                    data_type=idt0.name,
+                    name="Quant" + n.name,
+                    simd=simd,
+                )
+                graph.node.insert(node_ind, new_node)
+                graph.node.remove(n)
+                # remove the MultiThreshold too
+                graph.node.remove(consumer)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 91c191962f..c2e2cbcd8a 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -192,8 +192,12 @@ def build(self, code_gen_dir):
             f.write("#!/bin/bash \n")
             f.write(bash_compile + "\n")
         bash_command = ["bash", self.compile_script]
-        process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
-        process_compile.communicate()
+
+        with open(str(self.code_gen_dir) + "/compile.log", "w") as f:
+            try:
+                subprocess.check_output(bash_command, stderr=f)
+            except subprocess.CalledProcessError:
+                raise Exception(f"Error in compiling the generated code. Check {f.name} for more details.")
 
 
 def launch_process_helper(args, proc_env=None, cwd=None):
diff --git a/tests/fpgadataflow/test_fpgadataflow_softmax.py b/tests/fpgadataflow/test_fpgadataflow_softmax.py
new file mode 100644
index 0000000000..c813bc3ff9
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_softmax.py
@@ -0,0 +1,242 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
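+
+"""Tests for the QuantSoftmax HW custom op: inference of the layer from a
+Softmax + MultiThreshold pair, cppsim execution, and stitched-IP generation."""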
+
+import pytest
+
+import numpy as np
+import torch
+import torch.nn as nn
+from onnx import TensorProto, helper
+
+import brevitas.nn as qnn
+from brevitas.export import export_qonnx
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model
+from qonnx.util.cleanup import cleanup as qonnx_cleanup
+
+import finn.core.onnx_exec as oxe
+import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
+import finn.transformation.streamline.absorb as absorb
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
+
+test_fpga_part = "xczu3eg-sbva484-1-e"
+target_clk_ns = 5
+export_onnx_path = "pytest_quantsoftmax_dut.onnx"
+
+
+class QuantSoftMaxSimple(nn.Module):
+    def __init__(self, bit_width=8, signed=True):
+        super(QuantSoftMaxSimple, self).__init__()
+        self.output_identity = qnn.QuantIdentity(
+            bit_width=bit_width, scaling_per_tensor=True, bias=False, signed=signed
+        )
+        self.softmax = nn.Softmax(dim=3)  # softmax along the last dimension
+
+    def get_quant_scale(self):
+        return self.output_identity.quant_act_scale()
+
+    def forward(self, x):
+        x = self.softmax(x)
+        x = self.output_identity(x)
+        return x
+
+
+def create_model(io_shape=(1, 12, 128, 128), idt=DataType["INT8"]):
+    '''
+    Create a quantized softmax model.
+    Input and output are quantized to Int8ActPerTensorFloat, this is to make sure
+    that the softmax layer is followed by a Quant node.
+    '''
+    dut = QuantSoftMaxSimple(idt.bitwidth(), idt.signed())
+    input = torch.rand(io_shape)
+    export_qonnx(dut, input, export_onnx_path, opset_version=11)
+    qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)
+    # set the model input to UINT8
+    model = ModelWrapper(export_onnx_path)
+    model.set_tensor_datatype(model.graph.input[0].name, idt)
+    return model, dut.get_quant_scale()
+
+
+def make_single_quantsoftmax_modelwrapper(
+    impl_style="hls", simd=1, idt=DataType["UINT8"], ifm_dim=(1, 128, 128, 12), channels=12
+):
+    '''
+    Create a single QuantSoftmax node with variable parameters.
+    This is the graph before the SpecializeLayers() transformation.
+    '''
+    h = ifm_dim[1]
+    w = ifm_dim[2]
+
+    inp = helper.make_tensor_value_info("global_in", TensorProto.FLOAT, [1, h, w, channels])
+    outp = helper.make_tensor_value_info("global_out", TensorProto.FLOAT, [1, h, w, channels])
+    new_node = helper.make_node(
+        "QuantSoftmax",
+        ["global_in"],
+        ["global_out"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        ifm_dim=[h, w],
+        channels=channels,
+        data_type=idt.name,
+        simd=simd,
+        preferred_impl_style=impl_style,
+    )
+    graph = helper.make_graph([new_node], "softmax_graph", inputs=[inp], outputs=[outp])
+    model = qonnx_make_model(graph)
+    model = ModelWrapper(model)
+
+    model.set_tensor_datatype("global_in", idt)
+    model.set_tensor_datatype("global_out", idt)
+
+    return model
+
+
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim", "stitched_ip"])
+@pytest.mark.parametrize("simd", ["simd1", "simd2", "simd3", "simd4"])
+@pytest.mark.fpgadataflow
+def test_convert_to_hw_softmax_layer(exec_mode, simd):
+    '''
+    This test checks that the softmax layer can be converted to a HW layer.
+    '''
+    if (exec_mode == "stitched_ip" or exec_mode == "rtlsim") and simd != "simd1":
+        pytest.skip("Skipping this test to avoid long test times")
+    # Create the qonnx model
+    io_shape = (1, 12, 128, 128)
+    input = gen_finn_dt_tensor(DataType["UINT8"], io_shape)
+    input_t = {"global_in": input}
+
+    model, _ = create_model(io_shape)
+
+    simd = int(simd[-1])
+    folding_config = {
+        "Defaults": {},
+        "QuantSoftmax_0": {
+            "simd": simd,
+            "preferred_impl_style": "hls",
+        },
+    }
+    try:
+        model = model.transform(ConvertQONNXtoFINN())
+        model = model.transform(InferShapes())
+        model = model.transform(InferDataTypes())
+        model = model.transform(to_hw.InferQuantSoftmax())
+        model = model.transform(GiveUniqueNodeNames())
+        # isolate fpga dataflow layers
+        parent_model = model.transform(CreateDataflowPartition())
+        sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+        sdp_node_path = getCustomOp(sdp_node).get_nodeattr("model")
+        model = ModelWrapper(sdp_node_path)
+        model = model.transform(ApplyConfig(folding_config))
+        model = model.transform(SpecializeLayers(test_fpga_part))
+        model = model.transform(GiveUniqueNodeNames())
+        if exec_mode == "cppsim":
+            model = model.transform(SetExecMode("cppsim"))
+            model = model.transform(PrepareCppSim())
+            model = model.transform(CompileCppSim())
+        elif exec_mode == "rtlsim":
+            model = model.transform(SetExecMode("rtlsim"))
+            model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+            model = model.transform(HLSSynthIP())
+            try:
+                model = model.transform(PrepareRTLSim())
+                pytest.fail("PrepareRTLSim should have failed")
+            except Exception:
+                # expected to fail because this node does not support rtlsim
+                pass
+        elif exec_mode == "stitched_ip":
+            model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+            model = model.transform(HLSSynthIP())
+            model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    except Exception as e:
+        pytest.fail(f"Failed to transform the model: {str(e)}")
+
+
+@pytest.mark.parametrize("impl_style", ["hls"])
+@pytest.mark.parametrize("simd", ["simd1", "simd2", "simd3", "simd4"])
+@pytest.mark.parametrize("idt", ["INT8"])
+@pytest.mark.parametrize("ifm_dim", [(1, 12, 12, 12), (1, 128, 128, 384)])
+@pytest.mark.fpgadataflow
+def test_fpga_dataflow_quantsoftmax(impl_style, simd, idt, ifm_dim):
+    idt = DataType[idt]
+    simd = int(simd[-1])
+    io_shape = (ifm_dim[0], ifm_dim[1], ifm_dim[2], ifm_dim[3])
+    tolerance = 2
+
+    if ifm_dim[3] % simd != 0:
+        pytest.skip("Skipping this test because the number of channels is not a multiple of simd")
+
+    model = make_single_quantsoftmax_modelwrapper(
+        impl_style=impl_style, simd=simd, idt=idt, ifm_dim=ifm_dim, channels=ifm_dim[3]
+    )
+
+    input = gen_finn_dt_tensor(idt, io_shape)
+    input_t = {"global_in": input}
+
+    # Create reference values using the qonnx model
+    ref_model, scale = create_model(io_shape, idt)
+    y_ref = oxe.execute_onnx(ref_model, input_t)["global_out"]
+    y_ref = y_ref / scale
+    y_ref = y_ref.numpy()
+
+    y_out = oxe.execute_onnx(model, input_t)["global_out"]
+    assert np.allclose(y_ref, y_out, atol=tolerance), "Model output does not match expected output"
+
+    try:
+        model = model.transform(SpecializeLayers(test_fpga_part))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(SetExecMode("cppsim"))
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+    except Exception as e:
+        pytest.fail(f"Failed to transform the model: {str(e)}")
+
+    # run the model
+    y_hw = oxe.execute_onnx(model, input_t)["global_out"]
+
+    # Debug prints to help identify the failing values
+    for i in range(len(y_ref)):
+        for j in range(len(y_ref[i])):
+            for k in range(len(y_ref[i][j])):
+                for l in range(len(y_ref[i][j][k])):
+                    if not np.allclose(y_ref[i][j][k][l], y_hw[i][j][k][l], atol=tolerance):
+                        print(f"| {i},{j},{k},{l:<2} | {y_ref[i][j][k][l]:<4.0f} | {y_hw[i][j][k][l]:<4.0f} | {y_ref[i][j][k][l] - y_hw[i][j][k][l]:<4.0f} |")
+
+    assert np.allclose(y_ref, y_hw, atol=tolerance), "Model output does not match expected output"
\ No newline at end of file
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
index e4dd49fc7f..fe7ba3d9fb 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py
@@ -129,14 +129,14 @@ def make_single_multithresholding_modelwrapper(
         [1, 2, 2],
     ],
 )
-@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]])
+@pytest.mark.parametrize("activation", [DataType["UINT4"], DataType["INT4"], DataType["BIPOLAR"]])
 @pytest.mark.parametrize(
     "idt_tdt_cfg",
     [
         (DataType["INT8"], DataType["INT8"]),
         (DataType["INT8"], DataType["INT9"]),
-        (DataType["UINT8"], DataType["UINT8"]),
-        (DataType["UINT8"], DataType["UINT9"]),
+        (DataType["UINT5"], DataType["UINT5"]),
+        (DataType["UINT5"], DataType["UINT6"]),
     ],
 )
 @pytest.mark.parametrize("fold", [-1, 1, 2])
@@ -184,7 +184,7 @@ def test_fpgadataflow_thresholding(
         activation_bias = 0
     else:
         activation_bias = activation.min()
-        if narrow:
+        if narrow and activation.signed():
             activation_bias += 1
 
     # Generate random thresholds and sort in ascending order
diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py
index 1ad695bb94..e6175ac58b 100644
--- a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py
+++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py
@@ -122,13 +122,16 @@ def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp
 
 
 @pytest.mark.parametrize("impl_style", ["rtl", "hls"])
+@pytest.mark.parametrize(
+    "idt_act_cfg", [(DataType["INT16"], DataType["INT4"]), (DataType["UINT8"], DataType["UINT4"])]
+)
 # configuration (ch, pe)
-@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3)])
+@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 6)])
@pytest.mark.parametrize("narrow", [True, False]) @pytest.mark.parametrize("per_tensor", [True, False]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_read(impl_style, cfg, narrow, per_tensor): +def test_runtime_thresholds_read(impl_style, idt_act_cfg, cfg, narrow, per_tensor): """Read back threshold weights during runtime 1. Create random initial weights T @@ -140,8 +143,8 @@ def test_runtime_thresholds_read(impl_style, cfg, narrow, per_tensor): pe = cfg[1] n_inp_vecs = [1, 2, 2] hls_mem_mode = "internal_decoupled" - act = DataType["INT4"] - idt = DataType["INT16"] + act = idt_act_cfg[1] + idt = idt_act_cfg[0] odt = act n_steps = act.get_num_possible_values() - 1 # Generate random thresholds and sort in ascending order @@ -151,7 +154,7 @@ def test_runtime_thresholds_read(impl_style, cfg, narrow, per_tensor): T = sort_thresholds_increasing(T) actval = act.min() - if narrow: + if narrow and act.signed(): actval += 1 model = make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs, ch) @@ -219,13 +222,16 @@ def read_weights(sim): @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.parametrize( + "idt_act_cfg", [(DataType["INT16"], DataType["INT4"]), (DataType["UINT8"], DataType["UINT4"])] +) # configuration (ch, pe) -@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3)]) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 6)]) @pytest.mark.parametrize("narrow", [True, False]) @pytest.mark.parametrize("per_tensor", [True, False]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_write(impl_style, cfg, narrow, per_tensor): +def test_runtime_thresholds_write(impl_style, idt_act_cfg, cfg, narrow, per_tensor): """Write threshold weights during runtime 1. Create random initial weights T_init @@ -241,8 +247,8 @@ def test_runtime_thresholds_write(impl_style, cfg, narrow, per_tensor): n_inp_vecs = [1, 2, 2] hls_mem_mode = "internal_decoupled" - act = DataType["INT4"] - idt = DataType["INT16"] + act = idt_act_cfg[1] + idt = idt_act_cfg[0] odt = act n_steps = act.get_num_possible_values() - 1 @@ -253,7 +259,7 @@ def test_runtime_thresholds_write(impl_style, cfg, narrow, per_tensor): T_init = sort_thresholds_increasing(T_init) actval = act.min() - if narrow: + if narrow and act.signed(): actval += 1 model = make_single_thresholding_modelwrapper(