diff --git a/AUTHORS.rst b/AUTHORS.rst index 861b81924b..5a11497fc8 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -28,3 +28,9 @@ Contributors * Matthias Gehre (@mgehre-amd) * Hugo Le Blevec (@hleblevec) * Patrick Geel (@patrickgeel) +* John Monks (@jmonks-amd) +* Tim Paine (@timkpaine) +* Linus Jungemann (@LinusJungemann) +* Shashwat Khandelwal (@shashwat1198) +* Ian Colbert (@i-colbert) +* Rachit Garg (@rstar900) diff --git a/CHANGELOG.rst b/CHANGELOG.rst deleted file mode 100644 index 226e6f5931..0000000000 --- a/CHANGELOG.rst +++ /dev/null @@ -1,10 +0,0 @@ -========= -Changelog -========= - -Version 0.1 -=========== - -- Feature A added -- FIX: nasty bug #1729 fixed -- add your changes here! diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d376a1b42b..5e34624790 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,6 +29,60 @@ Please follow the steps below and be sure that your contribution complies with o 1. The main branch should always be treated as stable and clean. Only hot fixes are allowed to be pull-requested. A hot fix is reserved for critical issues that would break many things if left unfixed. 2. For new features, smaller bug fixes, doc updates, and many other fixes, users should pull request against the development branch. -3. We will review your contribution and, if any additional fixes or modifications are +3. Sign Your Work + +Please use the *Signed-off-by* line at the end of your patch, which indicates that you accept the Developer Certificate of Origin (DCO) defined by https://developercertificate.org/ and reproduced below: + +``` + Developer Certificate of Origin + Version 1.1 + + Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + 1 Letterman Drive + Suite D4700 + San Francisco, CA, 94129 + + Everyone is permitted to copy and distribute verbatim copies of this + license document, but changing it is not allowed. + + + Developer's Certificate of Origin 1.1 + + By making a contribution to this project, I certify that: + + (a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + + (b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + + (c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + + (d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +``` + +You can add the Signed-off-by line automatically by passing the `-s` flag to the `git commit` command, e.g. `git commit -s -m "Your commit message"`. + +Here is an example Signed-off-by line, which indicates that the contributor accepts the DCO: + +``` + This is my commit message + + Signed-off-by: Jane Doe <jane.doe@example.com> +``` + +4. We will review your contribution and, if any additional fixes or modifications are necessary, may provide feedback to guide you. When accepted, your pull request will be merged to the repository. If you have more questions please contact us.
diff --git a/LICENSE.txt b/LICENSE.txt index 278564a5a4..cec78d6043 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,5 @@ -Copyright (c) 2020, Xilinx +Copyright (C) 2020-2022, Xilinx, Inc. +Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index 2e1faf8f0c..0856701908 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,12 @@ -drawing +drawing [![GitHub Discussions](https://img.shields.io/badge/discussions-join-green)](https://github.com/Xilinx/finn/discussions) [![ReadTheDocs](https://readthedocs.org/projects/finn/badge/?version=latest&style=plastic)](http://finn.readthedocs.io/) -FINN is an experimental framework from Xilinx Research Labs to explore deep neural network -inference on FPGAs. +FINN is an experimental framework from the Integrated Communications and AI Lab of AMD Research & Advanced Development to explore deep neural network inference on FPGAs. It specifically targets quantized neural networks, with emphasis on generating dataflow-style architectures customized for each network. @@ -28,7 +27,7 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s ## Documentation -You can view the documentation on [readthedocs](https://finn.readthedocs.io) or build them locally using `python setup.py doc` from inside the Docker container. Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/main/notebooks), which we recommend running from inside Docker for a better experience. +You can view the documentation on [readthedocs](https://finn.readthedocs.io). Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/main/notebooks), which we recommend running from inside Docker for a better experience. ## Community diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 9d7ca809db..2ceb1f4195 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -1,4 +1,5 @@ -# Copyright (c) 2021, Xilinx +# Copyright (C) 2021-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,7 +28,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FROM ubuntu:jammy-20230126 -LABEL maintainer="Yaman Umuroglu " +LABEL maintainer="Jakoba Petri-Koenig , Yaman Umuroglu " ARG XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt" diff --git a/docs/finn/brevitas_export.rst b/docs/finn/brevitas_export.rst index 950b601f98..0a1c788324 100644 --- a/docs/finn/brevitas_export.rst +++ b/docs/finn/brevitas_export.rst @@ -8,11 +8,11 @@ Brevitas Export :scale: 70% :align: center -FINN expects an ONNX model as input. This can be a model trained with `Brevitas `_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks `_. Brevitas provides an export of a quantized network in ONNX representation in several flavors. -Two of the Brevitas-exported ONNX variants can be ingested by FINN: - - * FINN-ONNX: Quantized weights exported as tensors with additional attributes to mark low-precision datatypes. Quantized activations exported as MultiThreshold nodes. - * QONNX: All quantization is represented using Quant, BinaryQuant or Trunc nodes.
QONNX must be converted into FINN-ONNX by :py:mod:`finn.transformation.qonnx.convert_qonnx_to_finn` +FINN expects an ONNX model as input. This can be a model trained with `Brevitas `_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks `_. +Brevitas provides an export of a quantized network in QONNX representation, which is the format that can be ingested by FINN. +In a QONNX graph, all quantization is represented using Quant, BinaryQuant or Trunc nodes. +QONNX must be converted into FINN-ONNX by :py:mod:`finn.transformation.qonnx.convert_qonnx_to_finn`. FINN-ONNX is the intermediate representation (IR) FINN uses internally. +In this IR, quantized weights are represented as tensors with additional attributes that mark low-precision datatypes, and quantized activations are expressed as MultiThreshold nodes. To work with either type of ONNX model, it is loaded into a :ref:`modelwrapper` provided by FINN. diff --git a/docs/finn/command_line.rst b/docs/finn/command_line.rst index 8c37479a28..110a522847 100644 --- a/docs/finn/command_line.rst +++ b/docs/finn/command_line.rst @@ -20,7 +20,7 @@ two command line entry points for productivity and ease-of-use: Jupyter notebook as a starting point, visualizing the model at intermediate steps and adding calls to new transformations as needed. Once you have a working flow, you can implement a command line entry for this - by using the "advanced mode" described here. + by using the "advanced mode". Simple dataflow build mode ========================== This mode is intended for simpler networks whose topologies resemble the FINN end-to-end examples. -It runs a fixed build flow spanning tidy-up, streamlining, HLS conversion +It runs a fixed build flow spanning tidy-up, streamlining, HW conversion and hardware synthesis. It can be configured to produce different outputs, including stitched IP for integration in Vivado IPI as well as bitfiles. @@ -43,7 +43,9 @@ To use it, first create a folder with the necessary configuration and model file 3. Create a JSON file with the build configuration. It must be named ``dataflow_build_dir/dataflow_build_config.json``. Read more about the build configuration options on :py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig`. You can find an example .json file under ``src/finn/qnn-data/build_dataflow/dataflow_build_config.json`` -4. (Optional) create a JSON file with the folding configuration. It must be named ``dataflow_build_dir/folding_config.json``. +4. (Optional) create a JSON file with the specialize layers configuration. It must be named ``dataflow_build_dir/specialize_layers_config.json``. + You can find an example .json file under ``src/finn/qnn-data/build_dataflow/specialize_layers_config.json``. +5. (Optional) create a JSON file with the folding configuration. It must be named ``dataflow_build_dir/folding_config.json``. You can find an example .json file under ``src/finn/qnn-data/build_dataflow/folding_config.json``. Instead of specifying the folding configuration, you can use the `target_fps` option in the build configuration to control the degree of parallelization for your network. @@ -59,25 +61,28 @@ as it goes through numerous steps: ..
code-block:: none - Building dataflow accelerator from /home/maltanar/sandbox/build_dataflow/model.onnx + Building dataflow accelerator from build_dataflow/model.onnx Outputs will be generated at output_tfc_w1a1_Pynq-Z1 Build log is at output_tfc_w1a1_Pynq-Z1/build_dataflow.log - Running step: step_tidy_up [1/16] - Running step: step_streamline [2/16] - Running step: step_convert_to_hls [3/16] - Running step: step_create_dataflow_partition [4/16] - Running step: step_target_fps_parallelization [5/16] - Running step: step_apply_folding_config [6/16] - Running step: step_generate_estimate_reports [7/16] - Running step: step_hls_codegen [8/16] - Running step: step_hls_ipgen [9/16] - Running step: step_set_fifo_depths [10/16] - Running step: step_create_stitched_ip [11/16] - Running step: step_measure_rtlsim_performance [12/16] - Running step: step_make_pynq_driver [13/16] - Running step: step_out_of_context_synthesis [14/16] - Running step: step_synthesize_bitfile [15/16] - Running step: step_deployment_package [16/16] + Running step: step_qonnx_to_finn [1/19] + Running step: step_tidy_up [2/19] + Running step: step_streamline [3/19] + Running step: step_convert_to_hw [4/19] + Running step: step_create_dataflow_partition [5/19] + Running step: step_specialize_layers [6/19] + Running step: step_target_fps_parallelization [7/19] + Running step: step_apply_folding_config [8/19] + Running step: step_minimize_bit_width [9/19] + Running step: step_generate_estimate_reports [10/19] + Running step: step_hw_codegen [11/19] + Running step: step_hw_ipgen [12/19] + Running step: step_set_fifo_depths [13/19] + Running step: step_create_stitched_ip [14/19] + Running step: step_measure_rtlsim_performance [15/19] + Running step: step_out_of_context_synthesis [16/19] + Running step: step_synthesize_bitfile [17/19] + Running step: step_make_pynq_driver [18/19] + Running step: step_deployment_package [19/19] You can read a brief description of what each step does on @@ -99,6 +104,7 @@ The following outputs will be generated regardless of which particular outputs a * ``build_dataflow.log`` is the build logfile that will contain any warnings/errors * ``time_per_step.json`` will report the time (in seconds) each build step took * ``final_hw_config.json`` will contain the final (after parallelization, FIFO sizing etc) hardware configuration for the build +* ``template_specialize_layers_config.json`` is an example JSON file that can be used to set the specialize-layers configuration * ``intermediate_models/`` will contain the ONNX file(s) produced after each build step @@ -206,3 +212,5 @@ You can launch the desired custom build flow using: This will mount the specified folder into the FINN Docker container and launch the build flow. If the entry-point name is not specified it will default to ``build`` and thus execute ``build.py``. If it is specified, the executed script will be the corresponding ``.py`` file. + +If you would like to learn more about advanced builder settings, please have a look at `our tutorial about this topic `_. diff --git a/docs/finn/conf.py b/docs/finn/conf.py index 47ba99fb5f..a4416706c2 100644 --- a/docs/finn/conf.py +++ b/docs/finn/conf.py @@ -19,7 +19,7 @@ # -- Project information ----------------------------------------------------- project = "FINN" -copyright = "2020, Xilinx" +copyright = "2020-2022, Xilinx, 2022-2024, AMD" author = "Y. Umuroglu and J.
Petri-Koenig" diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst index 1e1c48e2b5..3b182b8db8 100644 --- a/docs/finn/developers.rst +++ b/docs/finn/developers.rst @@ -10,7 +10,7 @@ Power users may also find this information useful. Prerequisites ================ -Before starting to do development on FINN it's a good idea to start +Before starting to do development on FINN it is a good idea to start with understanding the basics as a user. Going through all of the :ref:`tutorials` is strongly recommended if you haven't already done so. Additionally, please review the documentation available on :ref:`internals`. @@ -61,7 +61,7 @@ further detailed below: Docker images =============== -If you want to add new dependencies (packages, repos) to FINN it's +If you want to add new dependencies (packages, repos) to FINN it is important to understand how we handle this in Docker. The finn.dev image is built and launched as follows: 2. run-docker.sh launches the build of the Docker image with `docker build` (unless ``FINN_DOCKER_PREBUILT=1``). Docker image is built from docker/Dockerfile.finn using the following steps: - * Base: PyTorch dev image + * Base: Ubuntu 22.04 LTS image * Set up apt dependencies: apt-get install a few packages for verilator * Set up pip dependencies: Python packages FINN depends on are listed in requirements.txt, which is copied into the container and pip-installed. Some additional packages (such as Jupyter and Netron) are also installed. * Install XRT deps, if needed: For Vitis builds we need to install the extra dependencies for XRT. This is only triggered if the image is built with the INSTALL_XRT_DEPS=1 argument. @@ -84,9 +84,9 @@ The finn.dev image is built and launched as follows: 4. Entrypoint script (docker/finn_entrypoint.sh) upon launching the container performs the following: - * Source Vivado settings64.sh from specified path to make vivado and vivado_hls available. - * Download PYNQ board files into the finn root directory, unless they already exist. - * Source Vitits settings64.sh if Vitis is mounted. + * Source Vivado settings64.sh from specified path to make vivado and vitis_hls available. + * Download board files into the finn root directory, unless they already exist or ``FINN_SKIP_BOARD_FILES=1``. + * Source Vitis settings64.sh if Vitis is mounted. 5. Depending on the arguments to run-docker.sh a different application is launched. run-docker.sh notebook launches a Jupyter server for the tutorials, whereas run-docker.sh build_custom and run-docker.sh build_dataflow trigger a dataflow build (see documentation). Running without arguments yields an interactive shell. See run-docker.sh for other options. @@ -106,7 +106,7 @@ Linting We use a pre-commit hook to auto-format Python code and check for issues. See https://pre-commit.com/ for installation. Once you have pre-commit, you can install the hooks into your local clone of the FINN repo. -It's recommended to do this **on the host** and not inside the Docker container: +It is recommended to do this **on the host** and not inside the Docker container: :: @@ -119,7 +119,7 @@ you may have to fix it manually, then run `git commit` once again. The checks are configured in .pre-commit-config.yaml under the repo root. Testing -======= +======== Tests are vital to keep FINN running. All the FINN tests can be found at https://github.com/Xilinx/finn/tree/main/tests.
These tests can be roughly grouped into three categories: @@ -132,7 +132,7 @@ These tests can be roughly grouped into three categories: Additionally, qonnx, brevitas and finn-hlslib also include their own test suites. The full FINN compiler test suite -(which will take several hours to run and require a PYNQ board) can be executed +(which will take several hours to run) can be executed by: :: @@ -146,7 +146,7 @@ requiring Vivado or as slow-running tests: bash ./run-docker.sh quicktest -When developing a new feature it's useful to be able to run just a single test, +When developing a new feature it is useful to be able to run just a single test, or a group of tests that e.g. share the same prefix. You can do this inside the Docker container from the FINN root directory as follows: @@ -178,16 +178,9 @@ FINN provides two types of documentation: * manually written documentation, like this page * autogenerated API docs from Sphinx -Everything is built using Sphinx, which is installed into the finn.dev -Docker image. You can build the documentation locally by running the following -inside the container: - -:: - - python setup.py docs +Everything is built using Sphinx. -You can view the generated documentation on build/html/index.html. -The documentation is also built online by readthedocs: +The documentation is built online by readthedocs: * finn.readthedocs.io contains the docs from the master branch * finn-dev.readthedocs.io contains the docs from the dev branch diff --git a/docs/finn/end_to_end_flow.rst b/docs/finn/end_to_end_flow.rst index 0a022067c3..8fafde5a5e 100644 --- a/docs/finn/end_to_end_flow.rst +++ b/docs/finn/end_to_end_flow.rst @@ -2,7 +2,11 @@ End-to-End Flow *************** -The following image shows an example end-to-end flow in FINN, starting from a trained PyTorch/Brevitas network and going all the way to a running FPGA accelerator. +The following image shows an example end-to-end flow in FINN for a PYNQ board. +Please note that you can build an IP block for your neural network **for any AMD-Xilinx FPGA**, but we only provide automatic system integration for a limited number of boards. +However, you can use Vivado to integrate an IP block generated by FINN into your own design. + +The example flow in this image starts from a trained PyTorch/Brevitas network and goes all the way to a running FPGA accelerator. As you can see in the picture, FINN is highly modular: the flow can be stopped at any point and the intermediate result can be used for further processing or other purposes. This enables a wide range of users to benefit from FINN, even if they do not use the whole flow. .. image:: ../../notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg diff --git a/docs/finn/faq.rst b/docs/finn/faq.rst index ef4457f53a..70c2f24ed2 100644 --- a/docs/finn/faq.rst +++ b/docs/finn/faq.rst @@ -7,16 +7,6 @@ Frequently Asked Questions Can't find the answer to your question here? Check `FINN GitHub Discussions `_. -Can I install FINN out of the Docker container? - We do not support out of the Docker implementations at the moment. This is due - to the high complexity of the FINN project dependencies. - -Since FINN uses ONNX, can I compile any model from the ONNX Model Zoo to an FPGA accelerator? - The short answer is no. FINN uses ONNX in a specific (non-standard) way, including custom layer - types and quantization annotations. Networks must be first quantized using Brevitas and exported - to FINN-ONNX to be converted to FPGA accelerators.
- - Can I install FINN out of the Docker container? We do not support out of the Docker implementations at the moment. This is due to the high complexity of the FINN project dependencies. @@ -52,7 +42,6 @@ What operating systems are supported by FINN? FINN should work fine under any Linux-based OS capable of running Vivado/Vitis, as long as you install Docker (``docker-ce``) on your machine. - I am getting DocNav and Model_Composer errors when launching the Docker image. We do not mount those particular directories into the Docker container because they are not used. The errors are Vivado related but you can safely ignore them. @@ -74,16 +63,8 @@ How can I target an arbitrary Xilinx FPGA without PYNQ support? Why do FINN-generated architectures need FIFOs between layers? See https://github.com/Xilinx/finn/discussions/383 -How do I tell FINN to utilize DSPs instead of LUTs for MAC operations in particular layers? - This is done with the ``resType="dsp"`` attribute on ``MatrixVectorActivation`` and ``Vector_Vector_Activate`` instances. - When using the ``build_dataflow`` system, this can be specified at a per layer basis by specifying it as part of one or more layers’ - folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`). - This is a good idea for layers with more weight/input act bits and high PE*SIMD. - See the `MobileNet-v1 build config for ZCU104 in finn-examples `_ for reference. - - How do I tell FINN to utilize a particular type of memory resource in particular layers? - This is done with the ``ram_style`` attribute. Check the particular ``HLSCustomOp`` attribute definition to see + This is done with the ``ram_style`` attribute. Check the particular ``HWCustomOp`` attribute definition to see which modes are supported (`example for MatrixVectorActivation `_). When using the ``build_dataflow`` system, this can be specified on a per-layer basis by specifying it as part of one or more layers’ folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`). diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index 6bb0f3ab1a..eae61b1a55 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -8,7 +8,7 @@ Quickstart ========== 1. Install Docker to run `without root `_ -2. Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2022.1``) +2. Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2022.2``) 3. Clone the FINN compiler from the repo: ``git clone https://github.com/Xilinx/finn/`` and go into the directory where it is cloned 4. Execute ``./run-docker.sh quicktest`` to verify your installation. 5. Optionally, follow the instructions on :ref:`PYNQ board first-time setup` or :ref:`Alveo first-time setup` for board setup. @@ -28,8 +28,8 @@ to train *customized* networks and create highly-efficient FPGA implementations In general, the approach for using the FINN framework is as follows: 1. Train your own quantized neural network (QNN) in `Brevitas `_. We have some `guidelines `_ on quantization-aware training (QAT). -2. Export to FINN-ONNX by following `this tutorial `_ . -3.
Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial `_ +2. Export to QONNX and convert to FINN-ONNX by following `this tutorial `_. +3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial `_ or for advanced settings have a look at this `tutorial `_. 4. Adjust your QNN topology, quantization settings and ``build_dataflow`` configuration to get the desired results. Please note that the framework is still under development, and how well this works will depend on how similar your custom network is to the examples we provide. @@ -49,13 +49,12 @@ Running FINN in Docker ====================== FINN runs inside a Docker container; it comes with a script to easily build and launch the container. If you are not familiar with Docker, there are many excellent `online resources `_ to get started. You may want to review the :ref:`General FINN Docker tips` and :ref:`Environment variables` as well. -If you want to use prebuilt images, read :ref:`Using a prebuilt image`. The above-mentioned script to build and launch the FINN docker container is called `run-docker.sh `_ . It can be launched in the following modes: Launch interactive shell ************************ Simply running bash run-docker.sh without any additional arguments will create a Docker container with all dependencies and give you a terminal which you can use for development and experimentation: :: @@ -93,11 +92,12 @@ This will launch the `Jupyter notebook `_ server inside a Environment variables ********************** -Prior to running the `run-docker.sh` script, there are several environment variables you can set to configure certain aspects of FINN. -These are summarized below: +Prior to running the ``run-docker.sh`` script, there are several environment variables you can set to configure certain aspects of FINN. +For a complete list, please have a look in the `run-docker.sh `_ file. +The most relevant are summarized below: * (required) ``FINN_XILINX_PATH`` points to your Xilinx tools installation on the host (e.g. ``/opt/Xilinx``) -* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2022.1``) +* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2022.2``) * (required for Alveo) ``PLATFORM_REPO_PATHS`` points to the Vitis platform files (DSA). * (required for Alveo) ``XRT_DEB_VERSION`` specifies the .deb to be installed for XRT inside the container (see default value in ``run-docker.sh``). * (optional) ``NUM_DEFAULT_WORKERS`` (default 4) specifies the degree of parallelization for the transformations that can be run in parallel, potentially reducing build time * (optional) ``NETRON_PORT`` (default 8081) changes the port for Netron inside Docker * (optional) ``PYNQ_BOARD`` or ``ALVEO_BOARD`` specifies the type of PYNQ/Alveo board used (see "supported hardware" below) for the test suite * (optional) ``IMAGENET_VAL_PATH`` specifies the path to the ImageNet validation directory for tests. -* (optional) ``FINN_DOCKER_PREBUILT`` (default 0) if set to 1 then skip Docker image building and use the image tagged with ``FINN_DOCKER_TAG``. * (optional) ``FINN_DOCKER_TAG`` (autogenerated) specifies the Docker image tag to use.
* (optional) ``FINN_DOCKER_RUN_AS_ROOT`` (default 0) if set to 1 then run Docker container as root, default is the current user. -* (optional) ``FINN_DOCKER_GPU`` (autodetected) if not 0 then expose all Nvidia GPUs or those selected by ``NVIDIA_VISIBLE_DEVICES`` to Docker container for accelerated DNN training. Requires `Nvidia Container Toolkit `_ * (optional) ``FINN_DOCKER_EXTRA`` (default "") pass extra arguments to the ``docker run`` command when executing ``./run-docker.sh`` * (optional) ``FINN_SKIP_DEP_REPOS`` (default "0") skips the download of FINN dependency repos (uses the ones already downloaded under deps/). * (optional) ``NVIDIA_VISIBLE_DEVICES`` (default "") specifies specific Nvidia GPUs to use in Docker container. Possible values are a comma-separated list of GPU UUID(s) or index(es) e.g. ``0,1,2``, ``all``, ``none``, or void/empty/unset. @@ -125,23 +123,11 @@ General FINN Docker tips * If you want a new terminal on an already-running container, you can do this with ``docker exec -it bash``. * The container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the finn compiler folder (which is mounted from the host computer) or otherwise backed up. -Using a prebuilt image -********************** - -By default the ``run-docker.sh`` script tries to re-build the Docker image with each run. After the first run this should go quite fast thanks to Docker caching. -If you are having trouble building the Docker image or need offline access, you can use prebuilt images by following these steps: - -1. Pull a prebuilt Docker image with ``docker pull maltanar/finn:`` where ```` can be ``dev_latest`` or ``main_latest`` -2. Set the ``FINN_DOCKER_TAG`` to the name of the image you just pulled e.g. ``FINN_DOCKER_TAG=maltanar/finn:dev_latest`` -3. Set ``FINN_DOCKER_PREBUILT=1`` -4. You can now launch the Docker image in all modes without re-building or any internet access. - - Supported FPGA Hardware ======================= -**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards, as well as Alveo cards. +**Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator. -**Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator. +**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Kria SOM, Ultra96, ZCU102 and ZCU104 boards, as well as Alveo cards. PYNQ board first-time setup **************************** @@ -177,7 +163,7 @@ On the target side: On the host side: -1.
Install Vitis 2022.1 and set up the ``VITIS_PATH`` environment variable to point to your installation. +1. Install Vitis 2022.2 and set up the ``VITIS_PATH`` environment variable to point to your installation. 2. Install Xilinx XRT. Ensure that the ``XRT_DEB_VERSION`` environment variable reflects which version of XRT you have installed. 3. Install the Vitis platform files for Alveo and set up the ``PLATFORM_REPO_PATHS`` environment variable to point to your installation. *This must be the same path as the target's platform files (target step 2)* 4. Set up the ``ALVEO_*`` environment variables accordingly for your target, see description of environment variables above. @@ -201,7 +187,7 @@ System Requirements * Ubuntu 18.04 with ``bash`` installed * Docker `without root `_ -* A working Vitis/Vivado 2022.1 installation +* A working Vitis/Vivado 2022.2 installation * ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables correctly set, see `Quickstart`_ * *(optional)* `Vivado/Vitis license`_ if targeting non-WebPack FPGA parts. * *(optional)* A PYNQ board with a network connection, see `PYNQ board first-time setup`_ diff --git a/docs/finn/hw_build.rst b/docs/finn/hw_build.rst index a5c486935d..9e34edc9d1 100644 --- a/docs/finn/hw_build.rst +++ b/docs/finn/hw_build.rst @@ -8,7 +8,7 @@ Hardware Build and Deployment :scale: 70% :align: center -A model where all layers have been converted to HLS layers can be processed by +A model where all layers have been converted to either HLS or RTL layers can be processed by FINN to build a bitfile and driver targeting a Zynq or Alveo system or to generate a Vivado IP Integrator (IPI) design with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. @@ -69,9 +69,11 @@ FINN will descend into each partition and insert FIFO nodes between streaming no with FIFO depths dictated by the node attributes, using the :py:mod:`finn.transformation.fpgadataflow.insert_fifo.InsertFIFO` transformation. Afterwards, IP blocks will be created for each partition, which in turn contain the -IP blocks for each layer stitched together. The layer-level IP blocks -are generated by Vivado HLS, using a sequence of :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP` +IP blocks for HLS layers and RTL modules for RTL layers stitched together. The layer-level IP blocks for HLS layers +are generated by Vitis HLS, using a sequence of :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP` and :py:mod:`finn.transformation.fpgadataflow.hlssynth_ip.HLSSynthIP` transformations. +For RTL layers, calling :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP` will fill out the RTL wrapper files and store all files belonging to the RTL module in a folder. + The top-level IP blocks are generated in Vivado IPI, using the :py:mod:`finn.transformation.fpgadataflow.create_stitched_ip.CreateStitchedIP` transformation. Vivado/Vitis Project Generation and Synthesis @@ -86,7 +88,7 @@ Deployment ========== -Deployment and Remote Execution ------------------------------- +Deployment +----------- -The bitfile and the driver file(s) are copied to the PYNQ board and can be executed there. For more information see the description in the `end2end_example `_ Jupyter notebooks. +The bitfile and the driver file(s) can be copied to the PYNQ board and executed there. For more information see the description in the `end2end_example `_ Jupyter notebooks.
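For orientation, the hardware build and deployment flow described above can also be driven programmatically instead of through the command line, using FINN's builder API. The following is a minimal sketch; the model filename is a placeholder, and the full set of options is defined in :py:mod:`finn.builder.build_dataflow_config`.

::

    from finn.builder.build_dataflow import build_dataflow_cfg
    from finn.builder.build_dataflow_config import (
        DataflowBuildConfig,
        DataflowOutputType,
        ShellFlowType,
    )

    # Zynq shell flow producing a bitfile, PYNQ driver and deployment package
    cfg = DataflowBuildConfig(
        output_dir="output_build",
        synth_clk_period_ns=10.0,
        board="Pynq-Z1",
        shell_flow_type=ShellFlowType.VIVADO_ZYNQ,
        generate_outputs=[
            DataflowOutputType.BITFILE,
            DataflowOutputType.PYNQ_DRIVER,
            DataflowOutputType.DEPLOYMENT_PACKAGE,
        ],
    )
    build_dataflow_cfg("model.onnx", cfg)  # placeholder model filename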
diff --git a/docs/finn/img/finn-hw-build.png b/docs/finn/img/finn-hw-build.png index f3a591fa8f..412317b8d1 100644 Binary files a/docs/finn/img/finn-hw-build.png and b/docs/finn/img/finn-hw-build.png differ diff --git a/docs/finn/img/finn-stack.png b/docs/finn/img/finn-stack.png index e34b1ecb45..c2b49de57e 100644 Binary files a/docs/finn/img/finn-stack.png and b/docs/finn/img/finn-stack.png differ diff --git a/docs/finn/img/nw-prep.png b/docs/finn/img/nw-prep.png index bed56ebc6d..28a7c9d3ff 100755 Binary files a/docs/finn/img/nw-prep.png and b/docs/finn/img/nw-prep.png differ diff --git a/docs/finn/img/repo-structure.png b/docs/finn/img/repo-structure.png index 704e5e5bda..05db9d201c 100644 Binary files a/docs/finn/img/repo-structure.png and b/docs/finn/img/repo-structure.png differ diff --git a/docs/finn/index.rst b/docs/finn/index.rst index c13bf81cec..ab9cc96fb1 100644 --- a/docs/finn/index.rst +++ b/docs/finn/index.rst @@ -5,21 +5,21 @@ FINN Welcome to the FINN Read the Docs website! What is FINN? -============= +============== .. image:: img/finn-stack.png - :scale: 40% + :scale: 15% :align: center 'FINN' is colloquially used to refer to two separate but highly related things: -* The FINN **project**, which is an experimental framework from Xilinx Research Labs - to explore deep neural network inference on FPGAs. It specifically targets - quantized neural networks (QNNs), with emphasis on generating dataflow-style +* The FINN **project**, which is an experimental framework from AMD Research and + Advanced Development (RAD) to explore deep neural network inference on FPGAs. + It specifically targets quantized neural networks (QNNs), with emphasis on generating dataflow-style architectures customized for each network. The key components are illustrated in the figure above; including tools for training quantized neural networks (Brevitas), the FINN compiler, and the finn-hlslib - Vivado HLS library of FPGA components for QNNs. + Vitis HLS library of FPGA components for QNNs. Read more on the `FINN project homepage `_. * The FINN **compiler**, which this Read the Docs website is the documentation for. diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst index a3d18bed77..825fafb0b6 100644 --- a/docs/finn/internals.rst +++ b/docs/finn/internals.rst @@ -27,8 +27,6 @@ Custom Operations/Nodes FINN uses many custom operations (op_type in ONNX NodeProto) that are not defined in the ONNX operator schema. These custom nodes are marked with domain="finn.*" or domain="qonnx.*" in the protobuf to identify them as such. These nodes can represent specific operations that we need for low-bit networks, or operations that are specific to a particular hardware backend. To get more familiar with custom operations and how they are created, please take a look in the Jupyter notebook about CustomOps (see chapter :ref:`tutorials` for details) or directly in the module :py:mod:`finn.custom_op`. -.. note:: See the description of `this PR `_ for more on how the operator wrapper library is organized. - Custom ONNX Execution Flow ========================== @@ -137,7 +135,7 @@ ModelWrapper contains more useful functions, if you are interested please have a Analysis Pass ============= -An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. 
If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass, see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis` . +An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass, see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis`. .. _transformation_pass: @@ -148,26 +146,26 @@ A transformation pass changes (transforms) the given model, it gets the model .. _mem_mode: -MatrixVectorActivation *mem_mode* -================================== +HLS variant of MatrixVectorActivation: *mem_mode* +================================================= FINN supports three types of the so-called *mem_mode* attribute for the node MatrixVectorActivation. This mode controls how the weight values are accessed during the execution. That means the mode setting has direct influence on the resulting circuit. Currently three settings for the *mem_mode* are supported in FINN: -* "const" +* "internal_embedded" (former "const" mode) -* "decoupled" +* "internal_decoupled" (former "decoupled" mode) * "external" -The following picture shows the idea behind the "const" and "decoupled" mode. +The following picture shows the idea behind the "internal_embedded" and "internal_decoupled" modes. .. image:: img/mem_mode.png :scale: 55% :align: center -Const mode ---------- -In *const* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `Matrix_Vector_Activate_Batch function `_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these. +Internal_embedded mode +------------------------ +In *internal_embedded* mode the weights are "baked into" the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *internal_embedded* mode IP block generation the `Matrix_Vector_Activate_Batch function `_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these. Advantages: @@ -175,17 +173,15 @@ Advantages: * easier to debug layer in cppsim since no additional components -* well-tested and mature components Disadvantages: * can lead to very long HLS synthesis times for certain weight array shapes * less control over the weight memory FPGA primitives; Vivado HLS doesn't always make the best resource allocation decisions -Decoupled mode -------------- -In *decoupled* mode a different variant of the MVAU with three ports is used.
Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU `_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib `_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined in a verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *const* mode. +Internal_decoupled mode +------------------------ +In *internal_decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU `_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib `_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined in a verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *internal_embedded* mode. Advantages: @@ -197,14 +193,12 @@ Advantages: Disadvantages: -* somewhat less well-tested compared to the const mode - -* higher resource footprint due to additional weight streamer and weight FIFO +* slightly higher resource footprint due to additional weight streamer and weight FIFO How to set *mem_mode* --------------------- -When the nodes in the network are converted to HLS layers, the *mem_mode* can be passed. More detailed information about the transformations that prepare the network and the transformation that performs the conversion to HLS layers can be found in chapter :ref:`nw_prep`. The *mem_mode* is passed as argument. Note that if no argument is passed, the default is *const*. +When the nodes in the network are specialized to HLS layers, the *mem_mode* can be passed. More detailed information about the transformations that prepare the network and the transformation that performs the specialization to HLS layers can be found in chapter :ref:`nw_prep`. The *mem_mode* is set as a node attribute and can be passed as part of the folding configuration. The default is *internal_decoupled*. ..
_folding_factors: @@ -217,46 +211,43 @@ Constraints to folding factors per layer * - **Layers** - **Parameters** - **Constraints** - * - Addstreams_Batch + * - Addstreams - PE - inp_channels % PE == 0 - * - ChannelwiseOp_Batch + * - ChannelwiseOp - PE - channels % PE == 0 * - ConvolutionInputGenerator - SIMD - inp_channels % SIMD == 0 - * - ConvolutionInputGenerator1d - - SIMD - - inp_channels % SIMD == 0 * - Downsampler - SIMD - inp_channels % SIMD == 0 - * - DuplicateStreams_Batch + * - DuplicateStreams - PE - channels % PE == 0 - * - Eltwise + * - StreamingEltwise - PE - inp_channels % PE == 0 - * - FMPadding_batch + * - FMPadding - SIMD - inp_channels % SIMD == 0 - * - FMPadding_rtl + * - FMPadding_Pixel - SIMD - inp_channels % SIMD == 0 - * - Globalaccpool_Batch + * - Globalaccpool - PE - channels % PE == 0 - * - Labelselect_Batch + * - Labelselect - PE - num_labels % PE == 0 * - MatrixVectorActivation - PE & SIMD - MH % PE == 0 & MW % SIMD == 0 - * - Pool_Batch + * - Pool - PE - inp_channels % PE == 0 - * - Thresholding_Batch + * - Thresholding - PE - MH % PE == 0 * - VectorVectorActivation @@ -280,9 +271,6 @@ This RTL version is an alternative to the original `HLS implementation `_. -When the network is transformed it is important to verify the functionality to make sure the transformation did not change the behaviour of the model. There are multiple ways of verification that can be applied in different stages of the network inside FINN. All can be accessed using the execution function in module :py:mod:`finn.core.onnx_exec`. The execution happens in most cases node by node, which supports networks that have a mixture of standard ONNX nodes, custom nodes and HLS custom nodes. A single node can be executed using one or more of the following methods: +When the network is transformed it is important to verify the functionality to make sure the transformation did not change the behaviour of the model. There are multiple ways of verification that can be applied in different stages of the network inside FINN. All can be accessed using the execution function in module :py:mod:`finn.core.onnx_exec`. The execution happens in most cases node by node, which supports networks that have a mixture of standard ONNX nodes, custom nodes and HLS/RTL custom nodes. A single node can be executed using one or more of the following methods: Simulation using Python ======================= -This simulation can be used right after the :ref:`brevitas_export` or when the network does not contain any HLS custom nodes, so right after the streamlining transformations and before the nodes are converted into HLS layers. +This simulation can be used right after the :ref:`brevitas_export` or when the network does not contain any HLS/RTL custom nodes yet, so right after the streamlining transformations and before the nodes are specialized into HLS/RTL layers. Simulation using C++ ==================== @@ -26,7 +26,7 @@ This simulation can be used for a model containing several HLS custom operations Emulation using PyVerilator =========================== -The emulation using PyVerilator can be used when IP blocks were generated, either node by node or of a whole (IP-stitched) design. For that purpose PyVerilator gets the generated verilog files. +The emulation using PyVerilator can be used when IP blocks/RTL modules were generated, either node by node or for a whole (IP-stitched) design. For that purpose PyVerilator gets the generated verilog files.
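Whichever of the three verification levels is used, execution is dispatched through the same entry point in :py:mod:`finn.core.onnx_exec`. A minimal sketch follows; the model filename is a placeholder and the random input is only for illustration.

::

    import numpy as np
    from qonnx.core.modelwrapper import ModelWrapper
    from finn.core.onnx_exec import execute_onnx

    model = ModelWrapper("model.onnx")  # placeholder filename
    iname = model.graph.input[0].name
    ishape = model.get_tensor_shape(iname)
    # random test input; execute_onnx returns a dict of output tensors
    input_dict = {iname: np.random.rand(*ishape).astype(np.float32)}
    output_dict = execute_onnx(model, input_dict)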
For debugging purposes, it's possible to generate .vcd trace files that show the value of external & internal signals as the emulation is running. To enable this: - for node-by-node rtlsim, set the `rtlsim_trace` attribute of each node of interest to either a file name for the vcd or `default` to use the node name as the filename. diff --git a/docs/requirements.txt b/docs/requirements.txt index 26c05d0025..85bc1d0dcd 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,7 +2,9 @@ brevitas@git+https://github.com/Xilinx/brevitas@master#egg=brevitas_examples dataclasses-json==0.5.7 docutils==0.17.1 gspread==3.6.0 +importlib_resources IPython +matplotlib netron pytest pyverilator@git+https://github.com/maltanar/pyverilator@master#egg=pyverilator @@ -10,4 +12,5 @@ qonnx@git+https://github.com/fastmachinelearning/qonnx@main#egg=qonnx sphinx_rtd_theme==0.5.0 torch torchvision +tqdm vcdvcd diff --git a/fetch-repos.sh b/fetch-repos.sh index a81b746921..073c052d67 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -30,7 +30,7 @@ QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f" FINN_EXP_COMMIT="de99347e936d51715f5356a1b6c64e37b91c23c2" BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db" -PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" +PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a" diff --git a/finn-rtllib/fifo/hdl/Q_srl.v b/finn-rtllib/fifo/hdl/Q_srl.v new file mode 100644 index 0000000000..11cef604e0 --- /dev/null +++ b/finn-rtllib/fifo/hdl/Q_srl.v @@ -0,0 +1,308 @@ +// original source: +// https://github.com/nachiket/tdfc/blob/master/verilog/queues/Q_srl_oreg3_prefull_SIMPLE.v + + +// Copyright (c) 1999 The Regents of the University of California +// Copyright (c) 2010 The Regents of the University of Pennsylvania +// Copyright (c) 2011 Department of Electrical and Electronic Engineering, Imperial College London +// Copyright (c) 2020 Xilinx +// +// Permission to use, copy, modify, and distribute this software and +// its documentation for any purpose, without fee, and without a +// written agreement is hereby granted, provided that the above copyright +// notice and this paragraph and the following two paragraphs appear in +// all copies. +// +// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR +// DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +// LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, +// EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, +// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +// AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON +// AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO +// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+// + +// Q_srl_oreg3_prefull_SIMPLE.v +// +// - In-page queue with parameterizable depth, bit width +// - Stream I/O is triple (data, valid, back-pressure), +// with EOS concatenated into the data +// - Flow control for input & output is combinationally decoupled +// - 2 <= depth <= 256 +// * (depth >= 2) is required to decouple I/O flow control, +// where empty => no produce, full => no consume, +// and depth 1 would ping-pong between the two at half rate +// * (depth <= 256) can be modified +// by changing ''synthesis loop_limit X'' below +// and changing ''addrwidth'' or its log computation +// - 1 <= width +// - Queue storage is in SRL16E, up to depth 16 per LUT per bit-slice, +// plus output register (for fast output) +// - Queue addressing is done by ''addr'' up-down counter +// - Queue fullness is checked by comparator (addr==depth) +// - Queue fullness is pre-computed for next cycle +// - Queue input back-pressure is pre-computed for next cycle +// - Queue output valid (state!=state__empty) is pre-computed for next cycle +// (necessary since SRL data output reg requires non-boolean state) +// - FSM has 3 states (empty, one, more) +// - When empty, continue to emit most recently emitted value (for debugging) +// +// - Queue slots used = / (state==state_empty) ? 0 +// | (state==state_one) ? 1 +// \ (state==state_more) ? addr+2 +// - Queue slots used <= depth +// - Queue slots remaining = depth - used +// = / (state==state_empty) ? depth +// | (state==state_one) ? depth-1 +// \ (state==state_more) ? depth-2-addr +// +// - Synplify 7.1 / 8.0 +// - Eylon Caspi, 9/11/03, 8/18/04, 3/29/05 + + +`ifdef Q_srl +`else +`define Q_srl + + +module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); + + parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256) + parameter width = 16; // - width of data (i_d, o_d) + + parameter addrwidth = $clog2(depth); + + input clock; + input reset; + + input [width-1:0] i_d; // - input stream data (concat data + eos) + input i_v; // - input stream valid + output i_r; // - input stream ready + wire i_b; // - input stream back-pressure + + output [width-1:0] o_d; // - output stream data (concat data + eos) + output o_v; // - output stream valid + input o_r; // - output stream ready + wire o_b; // - output stream back-pressure + + output [addrwidth:0] count; // - output number of elems in queue + output [addrwidth:0] maxcount; // - maximum observed count since reset + + reg [addrwidth:0] maxcount_reg; // - maximum count seen until now + reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address + // for data output + reg shift_en_; // - SRL16 shift enable + reg [width-1:0] srl [depth-2:0]; // - SRL16 memory + reg shift_en_o_; // - SRLO shift enable + reg [width-1:0] srlo_, srlo // - SRLO output reg + /* synthesis syn_allow_retiming=0 */ ; + + parameter state_empty = 2'd0; // - state empty : o_v=0 o_d=UNDEFINED + parameter state_one = 2'd1; // - state one : o_v=1 o_d=srlo + parameter state_more = 2'd2; // - state more : o_v=1 o_d=srlo + // #items in srl = addr+2 + + reg [1:0] state, state_; // - state register + + wire addr_full_; // - true iff addr==depth-2 on NEXT cycle + reg addr_full; // - true iff addr==depth-2 + wire addr_zero_; // - true iff addr==0 + wire o_v_reg_; // - true iff state_empty on NEXT cycle + reg o_v_reg // - true iff state_empty + /* synthesis syn_allow_retiming=0 */ ; + wire i_b_reg_; // - true iff !full on NEXT cycle + reg i_b_reg // - true iff !full + /* synthesis syn_allow_retiming=0 */ ; + + assign addr_full_ 
= (state_==state_more) && (addr_==depth-2); + // - queue full + assign addr_zero_ = (addr==0); // - queue contains 2 (or 1,0) + assign o_v_reg_ = (state_!=state_empty); // - output valid if non-empty + assign i_b_reg_ = addr_full_; // - input bp if full + assign o_d = srlo; // - output data from queue + assign o_v = o_v_reg; // - output valid if non-empty + assign i_b = i_b_reg; // - input bp if full + assign maxcount = maxcount_reg; + + assign i_r = !i_b; + assign o_b = !o_r; + + assign count = (state==state_more ? addr+2 : (state==state_one ? 1 : 0)); + + // - ''always'' block with both FFs and SRL16 does not work, + // since FFs need reset but SRL16 does not + + always @(posedge clock) begin // - seq always: FFs + if (reset) begin + state <= state_empty; + addr <= 0; + addr_full <= 0; + o_v_reg <= 0; + + i_b_reg <= 0; + maxcount_reg <= 0; + + end + else begin + state <= state_; + addr <= addr_; + addr_full <= addr_full_; + o_v_reg <= o_v_reg_; + i_b_reg <= i_b_reg_; + maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg); + end + end // always @ (posedge clock) + + always @(posedge clock) begin // - seq always: srlo + // - infer enabled output reg at end of shift chain + // - input first element from i_d, all subsequent elements from SRL16 + if (reset) begin + srlo <= 0; + end + else begin + if (shift_en_o_) begin + srlo <= srlo_; + end + end + end // always @ (posedge clock) + + always @(posedge clock) begin // - seq always: srl + // - infer enabled SRL16E from shifting srl array + // - no reset capability; srl[] contents undefined on reset + if (shift_en_) begin + // synthesis loop_limit 256 + for (a_=depth-2; a_>0; a_=a_-1) begin + srl[a_] = srl[a_-1]; + end + srl[0] <= i_d; + end + end // always @ (posedge clock or negedge reset) + + always @* begin // - combi always + srlo_ <= 'bx; + shift_en_o_ <= 1'bx; + shift_en_ <= 1'bx; + addr_ <= 'bx; + state_ <= 2'bx; + case (state) + + state_empty: begin // - (empty, will not produce) + if (i_v) begin // - empty & i_v => consume + srlo_ <= i_d; + shift_en_o_ <= 1; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_one; + end + else begin // - empty & !i_v => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_empty; + end + end + + state_one: begin // - (contains one) + if (i_v && o_b) begin // - one & i_v & o_b => consume + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1; + addr_ <= 0; + state_ <= state_more; + end + else if (i_v && !o_b) begin // - one & i_v & !o_b => cons+prod + srlo_ <= i_d; + shift_en_o_ <= 1; + shift_en_ <= 1; + addr_ <= 0; + state_ <= state_one; + end + else if (!i_v && o_b) begin // - one & !i_v & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_one; + end + else if (!i_v && !o_b) begin // - one & !i_v & !o_b => produce + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_empty; + end + end // case: state_one + + state_more: begin // - (contains more than one) + if (addr_full || (depth==2)) begin + // - (full, will not consume) + // - (full here if depth==2) + if (o_b) begin // - full & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 0; + addr_ <= addr; + state_ <= state_more; + end + else begin // - full & !o_b => produce + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 0; +// addr_ <= addr-1; +// state_ <= state_more; + addr_ <= addr_zero_ ? 0 : addr-1; + state_ <= addr_zero_ ? 
state_one : state_more; + end + end + else begin // - (mid: neither empty nor full) + if (i_v && o_b) begin // - mid & i_v & o_b => consume + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1; + addr_ <= addr+1; + state_ <= state_more; + end + else if (i_v && !o_b) begin // - mid & i_v & !o_b => cons+prod + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 1; + addr_ <= addr; + state_ <= state_more; + end + else if (!i_v && o_b) begin // - mid & !i_v & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 0; + addr_ <= addr; + state_ <= state_more; + end + else if (!i_v && !o_b) begin // - mid & !i_v & !o_b => produce + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 0; + addr_ <= addr_zero_ ? 0 : addr-1; + state_ <= addr_zero_ ? state_one : state_more; + end + end // else: !if(addr_full) + end // case: state_more + + default: begin + srlo_ <= 'bx; + shift_en_o_ <= 1'bx; + shift_en_ <= 1'bx; + addr_ <= 'bx; + state_ <= 2'bx; + end // case: default + + endcase // case(state) + end // always @ * + +endmodule // Q_srl + + +`endif // `ifdef Q_srl diff --git a/finn-rtllib/fifo/hdl/fifo_template.v b/finn-rtllib/fifo/hdl/fifo_template.v new file mode 100644 index 0000000000..3f14ae991f --- /dev/null +++ b/finn-rtllib/fifo/hdl/fifo_template.v @@ -0,0 +1,72 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +module $TOP_MODULE_NAME$( +//- Global Control ------------------ +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET = ap_rst_n" *) +(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) +input ap_rst_n, + +output $COUNT_RANGE$ count, +output $COUNT_RANGE$ maxcount, + +//- AXI Stream - Input -------------- +output in0_V_TREADY, +input in0_V_TVALID, +input $IN_RANGE$ in0_V_TDATA, + +//- AXI Stream - Output -------------- +input out_V_TREADY, +output out_V_TVALID, +output $OUT_RANGE$ out_V_TDATA +); + +Q_srl #( +.depth($DEPTH$), +.width($WIDTH$) +) +impl +( + .clock(ap_clk), + .reset(!ap_rst_n), + .count(count), + .maxcount(maxcount), + .i_d(in0_V_TDATA), + .i_v(in0_V_TVALID), + .i_r(in0_V_TREADY), + .o_d(out_V_TDATA), + .o_v(out_V_TVALID), + .o_r(out_V_TREADY) +); + +endmodule diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_template.v b/finn-rtllib/fmpadding/hdl/fmpadding_template.v index 0b0f40f86a..2347d9b394 100644 --- a/finn-rtllib/fmpadding/hdl/fmpadding_template.v +++ b/finn-rtllib/fmpadding/hdl/fmpadding_template.v @@ -31,10 +31,11 @@ module $TOP_MODULE_NAME$( //- Global Control ------------------ -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) -input ap_clk, -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) -input ap_rst_n, +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite, ASSOCIATED_RESET = ap_rst_n" *) +(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) +input ap_rst_n, //- AXI Lite ------------------------ // Writing @@ -86,7 +87,7 @@ fmpadding_axi #( .INIT_YOFF($INIT_YOFF$), .INIT_YEND($INIT_YEND$) ) -$TOP_MODULE_NAME$_impl +impl ( .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv new file mode 100644 index 0000000000..0ac2628ee5 --- /dev/null +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -0,0 +1,527 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP48. + *****************************************************************************/ + +module mvu_4sx4u #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + + int unsigned VERSION = 1, + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0 +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights + input logic [SIMD-1:0][3:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+3)/4; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 4*c; + localparam int unsigned PE_END = PE < 4*(c+1)? PE : 4*(c+1); + localparam int unsigned PE_REM = 4*(c+1) - PE_END; + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD][3]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [17:0] bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; + logic [29:0] aa; + logic [26:0] dd; + logic [ 1:0] xx[3:1]; + if(1) begin : blkVectorize + uwire [3:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin + if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[pe + PE_REM][1]), + .O5(xx[pe + PE_REM][0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe + PE_REM]+:3] = ww[pe]; + aa[D[pe + PE_REM]+ 3] = ww[pe][3]; + end + end + end : blkVectorize + + uwire [47:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. 
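+			// Worked example of the lane packing this kernel relies on (a sketch
+			// with illustrative numbers only, not taken from the surrounding
+			// code): with w0 = -3 packed at lane offset D[0] = 0 and w1 = 2 at
+			// D[1] = 8, an activation a = 5 yields the single DSP product
+			//     P = (w1*a)*2^8 + (w0*a) = 2560 - 15 = 2545 = 'h9F1.
+			// The low lane pp[7:0] = 8'hF1 reads back as -15 in two's complement,
+			// but the negative product borrows 1 from the lane above, so
+			// pp[15:8] = 9 instead of 10. The canary xx recomputes the low two
+			// bits of each upper lane's product independently (here 2'b10 versus
+			// the lane's 2'b01), so h3 = pp[D[i+1]+:2] - X3[i+1] recovers the
+			// borrow as -1, and Stage #5 adds it back when resolving lane totals.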
+ if(BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [17:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [45:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [47:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 0 : P3); + end + + assign pp = P3; + end : genBehav +`ifndef VERILATOR + else begin : genDSP + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1) + .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + 
.CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock input + .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable input for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. 
CEP (CEP, RESET). + .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit 
input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase + end : genDSP +`endif + + // External Canary Pipeline + logic [1:0] X1[3:1] = '{ default: 0 }; + logic [1:0] X2[3:1] = '{ default: 0 }; + logic [1:0] X3[3:1] = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + foreach(X3[i]) begin + X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]); + end + end + end + + // Derive actual cross-lane overflows + for(genvar i = 0; i < 3; i++) begin + assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1]; + end + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -8:0] hi4[3]; + uwire [$clog2(SIMD)+7:0] lo4[3]; + for(genvar i = 0; i < 4; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + + // Conclusive high part accumulation + if(i >= PE_REM && i < 3) begin : genHi + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + $signed(tree[0]); + end + assign hi4[i] = Hi4; + end : genHi + else if (i < 3) begin : genHiZero + assign hi4[i] = '0; + end : genHiZero + + // Conclusive low part accumulation + if(i >= PE_REM) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 3) assign up4 = Lo4; + else assign lo4[i] = Lo4; + end : blkLo + else begin : blkLoZero + assign lo4[i] = '0; + end : blkLoZero + + end + + // Stage #5: Resolve lane totals + logic signed [3:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[3] <= up4 - hi4[2]; + Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; + Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; + end + + end : genPipes + +endmodule : mvu_4sx4u diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv new file mode 100644 index 0000000000..fbf48784f0 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -0,0 +1,525 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP48. + *****************************************************************************/ + +module mvu_8sx8u_dsp48 #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + + int unsigned VERSION = 1, + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0 +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH; + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+1)/2; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 2*c; + localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); + localparam int unsigned PE_REM = 2*(c+1) - PE_END; + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; + logic [29:0] aa; + logic [26:0] dd; + logic [ 1:0] xx; + if(1) begin : blkVectorize + uwire [WEIGHT_WIDTH-1:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin + if(BEHAVIORAL) assign xx = zero? 0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[1]), + .O5(xx[0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + end + end + end : blkVectorize + + uwire [47:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. 
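+			// Sizing sketch for the note above (an assumed example configuration,
+			// for illustration only): with WEIGHT_WIDTH = 8 and
+			// ACTIVATION_WIDTH = 8, SINGLE_PROD_WIDTH = 16, so the lane offsets
+			// become D = '{ ACCU_WIDTH+16, 16, 0 } and each DSP slice serves two
+			// PEs (PIPE_COUNT = (PE+1)/2). The low product accumulates in
+			// pp[15:0] and the high product from pp[16] upward; as in the 4-bit
+			// sibling module mvu_4sx4u, a negative low-lane product borrows at
+			// most 1 from the high lane per cycle, and the 2-bit canary xx,
+			// compared against pp[D[1]+:2], is what lets h3 recover that borrow.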
+ if(BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [17:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [45:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [47:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 0 : P3); + end + + assign pp = P3; + end : genBehav +`ifndef VERILATOR + else begin : genDSP + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1) + .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + 
.CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock input + .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable input for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. 
CEP (CEP, RESET). + .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit 
input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase + end : genDSP +`endif + + // External Canary Pipeline + logic [1:0] X1 = '{ default: 0 }; + logic [1:0] X2 = '{ default: 0 }; + logic [1:0] X3 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]); + end + end + + // Derive actual cross-lane overflows + assign h3[s] = pp[D[1]+:2] - X3; + + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; + uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; + + // Conclusive high part accumulation + if(PE_REM == 0) begin : genHi + localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + $signed(tree[0]); + end + assign hi4 = Hi4; + end : genHi + else begin : genHiZero + assign hi4 = '0; + end : genHiZero + + for(genvar i = 0; i < 2; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + // Conclusive low part accumulation + if(i >= PE_REM) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 1) assign up4 = Lo4; + else assign lo4 = Lo4; + end : blkLo + else begin : blkLoZero + assign lo4 = '0; + end : blkLoZero + + end + + // Stage #5: Resolve lane totals + logic signed [1:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[1] <= up4 - hi4; + Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; + end + + end : genPipes + +endmodule : mvu_8sx8u_dsp48 diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv new file mode 100644 index 0000000000..3bbc7051b9 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -0,0 +1,430 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix/Vector Vector Unit (MVU/VVU) core compute kernel utilizing DSP58. 
+ *****************************************************************************/
+
+module mvu_vvu_8sx9_dsp58 #(
+	bit IS_MVU,
+	int unsigned PE,
+	int unsigned SIMD,
+	int unsigned ACTIVATION_WIDTH,
+	int unsigned WEIGHT_WIDTH,
+	int unsigned ACCU_WIDTH,
+	bit SIGNED_ACTIVATIONS = 0,
+	int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
+	bit FORCE_BEHAVIORAL = 0,
+
+	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD,
+	localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD
+	)
+	(
+	// Global Control
+	input	logic clk,
+	input	logic rst,
+	input	logic en,
+
+	// Input
+	input	logic last,
+	input	logic zero, // ignore current inputs and force this partial product to zero
+	input	logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights
+	input	logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
+
+	// Output
+	output	logic vld,
+	output	logic [PE-1:0][ACCU_WIDTH-1:0] p
+	);
+	// for verilator always use behavioral code
+	localparam bit BEHAVIORAL =
+`ifdef VERILATOR
+		1 ||
+`endif
+		FORCE_BEHAVIORAL;
+
+//-------------------- Declare global signals --------------------\\
+	localparam int unsigned CHAINLEN = (SIMD+2)/3;
+	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
+	localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE;
+	uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN];
+	uwire [23:0] b_in_i [PE][CHAINLEN];
+	uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
+
+//-------------------- Shift register for opmode select signal --------------------\\
+	localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
+	logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg).
+							   // Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
+
+	always_ff @(posedge clk) begin
+		if(rst) L <= '{default: 0};
+		else if(en) begin
+			L[1+MAX_PIPELINE_STAGES] <= last;
+			L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
+		end
+	end
+	assign vld = L[0];
+
+//-------------------- Shift register for ZERO flag --------------------\\
+	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
+
+	if (MAX_PIPELINE_STAGES > 1) begin : genZreg
+		always_ff @(posedge clk) begin
+			if (rst) Z <= '{default: 0};
+			else if(en) begin
+				Z[0] <= zero;
+				if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
+			end
+		end
+	end
+
+//-------------------- Buffer for input activations --------------------\\
+	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
+	for (genvar k=0; k<PE_ACTIVATION; k++) begin : genActPE
+		for (genvar i=0; i<CHAINLEN; i++) begin : genActSIMD
+			localparam int TOTAL_PREGS = i/SEGLEN;
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
+
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
+				always_ff @(posedge clk) begin
+					if (rst) A <= '{default: 0};
+					else if(en) begin
+						A[EXTERNAL_PREGS-1] <=
+// synthesis translate_off
+						zero ? '1 :
+// synthesis translate_on
+						a[SIMD*k + 3*i +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+					end
+				end
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+					assign a_in_i[CHAINLEN*k + i][9*j +: 9] = PAD_BITS_ACT == 0 ? A[0][j] : { {PAD_BITS_ACT{SIGNED_ACTIVATIONS ? A[0][j][ACTIVATION_WIDTH-1] : 1'b0}}, A[0][j] };
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k + i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genExternalPregAct
+			else begin : genInpDSPAct
+				for (genvar j=0; j<LANES_OCCUPIED; j++) begin : genAin
+					assign a_in_i[CHAINLEN*k + i][9*j +: 9] =
+// synthesis translate_off
+					zero ? '1 :
+// synthesis translate_on
+					PAD_BITS_ACT == 0 ? a[SIMD*k + 3*i + j] : { {PAD_BITS_ACT{SIGNED_ACTIVATIONS ? a[SIMD*k + 3*i + j][ACTIVATION_WIDTH-1] : 1'b0}}, a[SIMD*k + 3*i + j] };
+				end : genAin
+				for (genvar j=LANES_OCCUPIED; j<3; j++) begin : genAinZero
+					assign a_in_i[CHAINLEN*k + i][9*j +: 9] = 9'b0;
+				end : genAinZero
+			end : genInpDSPAct
+		end : genActSIMD
+	end : genActPE
+
+//-------------------- Buffer for weights --------------------\\
+	localparam int unsigned PAD_BITS_WEIGHT = 8 - WEIGHT_WIDTH;
+	for (genvar i=0; i<PE; i++) begin : genWeightPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genWeightSIMD
+			localparam int TOTAL_PREGS = j/SEGLEN;
+			localparam int EXTERNAL_PREGS = TOTAL_PREGS>1 ? TOTAL_PREGS-1 : 0;
+			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3;
+
+			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
+				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0};
+				always_ff @(posedge clk) begin
+					if (rst) B <= '{default: 0};
+					else if (en) begin
+						B[i][EXTERNAL_PREGS-1] <=
+// synthesis translate_off
+						zero ? '1 :
+// synthesis translate_on
+						//w[i][3*j +: LANES_OCCUPIED];
+						w[SIMD*i+3*j +: LANES_OCCUPIED];
+						if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
+					end
+				end
+				for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
+			end : genExternalPregWeight
+			else begin : genInpDSPWeight
+				for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
+					assign b_in_i[i][j][8*k +: 8] =
+// synthesis translate_off
+					zero ? '1 :
+// synthesis translate_on
+					PAD_BITS_WEIGHT == 0 ? w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] };
+				end : genBin
+				for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+					assign b_in_i[i][j][8*k +: 8] = 8'b0;
+				end : genBinZero
+			end : genInpDSPWeight
+		end : genWeightSIMD
+	end : genWeightPE
+
+//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
+	for (genvar i=0; i<PE; i++) begin : genDSPPE
+		for (genvar j=0; j<CHAINLEN; j++) begin : genDSPChain
+			localparam int TOTAL_PREGS = j/SEGLEN;
+			localparam int INTERNAL_PREGS = TOTAL_PREGS>0 ? 2 : 1; // 1 : 0
+			localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
+			localparam bit FIRST = j == 0;
+			localparam bit LAST = j == CHAINLEN-1;
+			uwire [57:0] pp;
+
+			if (LAST) begin : genPOUT
+				assign p[i] = pp[ACCU_WIDTH-1:0];
+			end
+
+			// Note: Since the product B * AD is computed,
+			//       rst can be only applied to AD and zero only to B
+			//       with the same effect as zeroing both.
+			if(BEHAVIORAL) begin : genBehav
+				// Stage #1: Input A/B
+				logic signed [33:0] Areg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst) Areg <= '{ default : 0};
+					else if (en) begin
+						Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] };
+						if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
+					end
+				end
+				logic signed [23:0] Breg [INTERNAL_PREGS];
+				always_ff @(posedge clk) begin
+					if (rst) Breg <= '{ default : 0};
+					else if (en) begin
+						Breg[0] <= b_in_i[i][j];
+						if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
+					end
+				end
+
+				// Stage #2: Multiply-Accumulate
+				logic signed [57:0] Mreg;
+				logic InmodeZero = 0;
+				always_ff @(posedge clk) begin
+					if (rst) InmodeZero <= 0;
+					else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
+				end
+				always_ff @(posedge clk) begin
+					if (rst) Mreg <= 0;
+					else if (en) begin
+						automatic logic signed [57:0] m = 0;
+						for (int k = 0; k < 3; k++) begin
+							m = m + (InmodeZero ?
0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); + end + Mreg <= m; + end + end + + // Stage #3: Accumulate + logic signed [57:0] Preg; + logic Opmode = 0; + if (FIRST && !LAST) begin : genFirst + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg; + end + end + else assign Preg = Mreg; + end + else if (FIRST && LAST) begin : genSingle + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; + end + end + else if (!FIRST && LAST) begin : genLast + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; + end + end + else begin : genMid + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg + pcout[i][j-1]; + end + end + else assign Preg = Mreg + pcout[i][j-1]; + end + assign pp = Preg; + assign pcout[i][j] = Preg; + end : genBehav +`ifndef VERILATOR + else begin: genDSP + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). + .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[i][j]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[i][j-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[(IS_MVU ? 
0 : CHAINLEN*i) + j] }), // 34-bit input: A data + .B(b_in_i[i][j]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSP +`endif + end : genDSPChain + end : genDSPPE + +endmodule : mvu_vvu_8sx9_dsp58 diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv new file mode 100644 index 0000000000..6498530113 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -0,0 +1,367 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. + * @details + * The following compute cores are supported: + * - 4-bit MVU on DSP48 achieving 4 MACs/DSP, + * - (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, + * - [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, + * Folding hints: + * - PE scaling should divide MH. + * - SIMD scaling should divide MW. + * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to + * impact critical paths more than PE scaling. PE scaling implies a + * bigger fanout on the input activations. + * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated + *****************************************************************************/ + +module mvu_vvu_axi #( + bit IS_MVU, + parameter COMPUTE_CORE, + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned SEGMENTLEN = 0, + + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + + bit PUMPED_COMPUTE = 0, + bit FORCE_BEHAVIORAL = 0, + bit M_REG_LUT = 1, + + // Safely deducible parameters + localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH = PE*ACCU_WIDTH, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7)/8 * 8, + localparam bit SIMD_UNEVEN = SIMD % 2 +)( + // Global Control + input logic ap_clk, + input logic ap_clk2x, // synchronous, double-speed clock; only used for PUMPED_COMPUTE + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +//-------------------- Parameter sanity checks --------------------\\ + initial begin + if (MW % SIMD != 0) begin + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; + end + if (MH % PE != 0) begin + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; + end + if (WEIGHT_WIDTH > 8) begin + $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); + $finish; + end + if (ACTIVATION_WIDTH > 8) begin + if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); + $finish; + end + end + if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of 
%0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end + end + if (!IS_MVU) begin + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin + $error("VVU only supported on DSP58 or LUT-based implementation"); + $finish; + end + end + end + + uwire clk = ap_clk; + uwire clk2x = ap_clk2x; + uwire rst = !ap_rst_n; + + //- Replay to Accommodate Neuron Fold ----------------------------------- + typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; + uwire mvu_flatin_t amvau; + uwire alast; + uwire afin; + uwire avld; + uwire ardy; + + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NF = MH/PE; + replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + ); + + //- Unflatten inputs into structured matrices --------------------------- + localparam int unsigned ACT_PE = IS_MVU? 1 : PE; + typedef logic [PE -1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; + typedef logic [ACT_PE-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; + + uwire mvu_w_t mvu_w = s_axis_weights_tdata; + + //- Conditional Activations Layout Adjustment for VVU + uwire mvu_a_t amvau_i; + if (IS_MVU || (PE == 1)) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i) + for(genvar pe = 0; pe < ACT_PE; pe++) begin + for(genvar simd = 0; simd < SIMD; simd++) begin + assign amvau_i[pe][simd] = amvau[simd*ACT_PE+pe]; + end + end + end : genVVUInput + + //- Flow Control Bracket around Compute Core ---------------------------- + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + + //- Conditionally Pumped DSP Compute ------------------------------------ + typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; + uwire ovld; + uwire dsp_p_t odat; + if(1) begin : blkDsp + localparam int unsigned EFFECTIVE_SIMD = SIMD_UNEVEN && PUMPED_COMPUTE ? 
SIMD+1 : SIMD; + localparam int unsigned DSP_SIMD = EFFECTIVE_SIMD/(PUMPED_COMPUTE+1); + typedef logic [PE -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH -1:0] dsp_w_t; + typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] dsp_a_t; + + uwire dsp_clk; + uwire dsp_en; + + uwire dsp_last; + uwire dsp_zero; + uwire dsp_w_t dsp_w; + uwire dsp_a_t dsp_a; + + uwire dsp_vld; + uwire dsp_p_t dsp_p; + + if(!PUMPED_COMPUTE) begin : genUnpumpedCompute + assign dsp_clk = clk; + assign dsp_en = en; + + assign dsp_last = alast && avld; + assign dsp_zero = !istb; + assign dsp_w = mvu_w; + assign dsp_a = amvau_i; + + assign ovld = dsp_vld; + assign odat = dsp_p; + end : genUnpumpedCompute + else begin : genPumpedCompute + assign dsp_clk = clk2x; + + // Identify second fast cycle just before active slow clock edge + logic Active = 0; + if(1) begin : blkActive + uwire clk_lut[2]; // Put some LUT delay on the input from the fast clock net + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk)); + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0])); + always_ff @(posedge clk2x) Active <= clk_lut[1]; + end : blkActive + + // The input for a slow cycle is split across two fast cycles along the SIMD dimension. + // - Both fast cycles are controlled by the same enable state. + // - A zero cycle is duplicated across both fast cycles. + // - The last flag must be restricted to the second fast cycle. + + dsp_w_t W = 'x; + for(genvar pe = 0; pe < PE; pe++) begin : genPERegW + + uwire [2*DSP_SIMD-1:0][WEIGHT_WIDTH-1:0] w; + for(genvar i = 0; i < SIMD; i++) assign w[i] = mvu_w[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign w[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) W[pe] <= 'x; + else if(en) W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegW + + dsp_a_t A = 'x; + for(genvar pe = 0; pe < ACT_PE; pe++) begin : genPERegA + + uwire [2*DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] a; + for(genvar i = 0; i < SIMD; i++) assign a[i] = amvau_i[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign a[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) A[pe] <= 'x; + else if(en) A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegA + + logic Zero = 1; + logic Last = 0; + always_ff @(posedge clk2x) begin + if(rst) begin + Zero <= 1; + Last <= 0; + end + else if(en) begin + Zero <= !istb; + Last <= alast && avld && Active; + end + end + + assign dsp_en = en; + assign dsp_last = Last; + assign dsp_zero = Zero; + assign dsp_w = W; + assign dsp_a = A; + + // Since no two consecutive last cycles will ever be asserted on the input, + // valid outputs will also always be spaced by, at least, one other cycle. + // We can always hold a captured output for two cycles to allow the slow + // clock to pick it up. 
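+			// Illustration (one slow clk cycle = two clk2x cycles): a dsp_vld pulse
+			// in either fast cycle loads P and sets Vld; Vld is then held through
+			// the next Active (second) fast cycle so that {Vld, P} are stable when
+			// the slow clock domain samples them on its following active edge.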
+ logic Vld = 0; + dsp_p_t P = 'x; + always_ff @(posedge clk2x) begin + if(rst) begin + Vld <= 0; + P <= 'x; + end + else if(en) begin + if(dsp_vld) P <= dsp_p; + Vld <= dsp_vld || (Vld && !Active); + end + end + assign ovld = Vld; + assign odat = P; + + end : genPumpedCompute + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_4sx4u": + mvu_4sx4u #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_8sx8u_dsp48": + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + default: initial begin + $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); + $finish; + end + endcase + + end : blkDsp + +//-------------------- Output register slice --------------------\\ + // Make `en`computation independent from external inputs. + // Drive all outputs from registers. + struct packed { + logic rdy; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } A = '{ rdy: 1, default: 'x }; // side-step register used when encountering backpressure + struct packed { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } B = '{ vld: 0, default: 'x }; // ultimate output register + + assign en = A.rdy; + uwire b_load = !B.vld || m_axis_output_tready; + + always_ff @(posedge clk) begin + if(rst) begin + A <= '{ rdy: 1, default: 'x }; + B <= '{ vld: 0, default: 'x }; + end + else begin + if(A.rdy) A.dat <= odat; + A.rdy <= (A.rdy && !ovld) || b_load; + + if(b_load) begin + B <= '{ + vld: ovld || !A.rdy, + dat: A.rdy? odat : A.dat + }; + end + end + end + assign m_axis_output_tvalid = B.vld; + // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO? + // These extra bits should never be used. Why not 'x them out? + assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; + +endmodule : mvu_vvu_axi diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v new file mode 100644 index 0000000000..50c15c1b02 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -0,0 +1,97 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU & VVU. + *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter IS_MVU = $IS_MVU$, + parameter COMPUTE_CORE = "$COMPUTE_CORE$", + parameter PUMPED_COMPUTE = 0, + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + // (* X_INTERFACE_PARAMETER = "ASSOCIATED_RESET ap_rst_n" *) + // (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *) + // input ap_clk2x, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + // Weight Stream + input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, + input weights_V_TVALID, + output weights_V_TREADY, + // Input Stream + input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, + // Output Stream + output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY +); + +mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) inst ( + .ap_clk(ap_clk), + .ap_clk2x(1'b0), // wired to ground since double-pumped compute not enabled through FINN for now + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(weights_V_TDATA), + .s_axis_weights_tvalid(weights_V_TVALID), + .s_axis_weights_tready(weights_V_TREADY), + .s_axis_input_tdata(in0_V_TDATA), + .s_axis_input_tvalid(in0_V_TVALID), + .s_axis_input_tready(in0_V_TREADY), + .m_axis_output_tdata(out_V_TDATA), + .m_axis_output_tvalid(out_V_TVALID), + .m_axis_output_tready(out_V_TREADY) +); + +endmodule // $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv new file mode 100644 index 0000000000..3e2766f63d --- /dev/null +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -0,0 +1,181 @@ +/****************************************************************************** + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Replay buffer for counted sequences on an AXI-lite stream. + * @author Thomas B. Preußer + *****************************************************************************/ + +module replay_buffer #( + int unsigned LEN, // Sequence length + int unsigned REP, // Sequence replay count + int unsigned W // Data width +)( + input logic clk, + input logic rst, + + input logic [W-1:0] idat, + input logic ivld, + output logic irdy, + + output logic [W-1:0] odat, + output logic olast, + output logic ofin, + output logic ovld, + input logic ordy +); + + if(LEN == 0) initial begin + $error("%m: Illegal zero sequence LEN."); + $finish; + end + if(REP == 0) initial begin + $error("%m: Illegal zero REP count."); + $finish; + end + + // Track position in Sequence + uwire last_item; + uwire shift; + if(LEN == 1) assign last_item = 1; + else begin + typedef logic [$clog2(LEN)-1:0] count_t; + count_t Count = 0; + logic Last = 0; + always_ff @(posedge clk) begin + if(rst) begin + Count <= 0; + Last <= 0; + end + else if(shift) begin + Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1); + Last <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last); + end + end + assign last_item = Last; + end + + if(REP == 1) begin + assign shift = ivld && ordy; + + assign irdy = ordy; + assign odat = idat; + assign olast = last_item; + assign ofin = last_item; + assign ovld = ivld; + end + else begin + + // Track Repetitions + uwire last_rep; + if(1) begin : blkRep + typedef logic [$clog2(REP)-1:0] rep_t; + rep_t RepCnt = 0; + logic RepLst = 0; + always_ff @(posedge clk) begin + if(rst) begin + RepCnt <= 0; + RepLst <= 0; + end + else if(last_item && shift) begin + RepCnt <= RepCnt + (RepLst? 2**$clog2(REP)-REP+1 : 1); + RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst); + end + end + assign last_rep = RepLst; + end : blkRep + + localparam int unsigned AWIDTH = LEN < 2? 1 : $clog2(LEN); + typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB + typedef logic [W -1:0] data_t; + + // Output Registers + data_t ODat; + logic OVld = 0; + logic OLst = 'x; + logic OFin = 'x; + assign odat = ODat; + assign olast = OLst; + assign ofin = OFin; + assign ovld = OVld; + + // Buffer Memory Management + data_t Mem[2**AWIDTH]; + ptr_t WP = 0; // Write Pointer + ptr_t RP = 0; // Read Pointer + ptr_t FP = 0; // Free Pointer + + // Operational Guards + // Occupancy: WP-FP + // WP-FP < 2**AWIDTH -> writing allowed + // - increments WP + // Availability: WP-RP + // WP-RP > 0 -> reading allowed + // - increments RP, last in sequence rewinds to FP for non-final repetition + // - increments FP in last repetition + assign irdy = !((WP-FP) >> AWIDTH); + + uwire wr = irdy && ivld; + uwire rd = !OVld || ordy; + always_ff @(posedge clk) begin + if(wr) Mem[WP[AWIDTH-1:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; + end + + uwire vld = (RP != WP); + assign shift = rd && vld; + always_ff @(posedge clk) begin + if(rst) begin + WP <= 0; + RP <= 0; + FP <= 0; + + OVld <= 0; + OLst <= 'x; + OFin <= 'x; + end + else begin + if(wr) WP <= WP + 1; + if(rd) begin + if(vld) begin + automatic logic rewind = last_item && !last_rep; + RP <= RP + (rewind? 
2**(AWIDTH+1)-LEN+1 : 1); + FP <= FP + last_rep; + end + + OVld <= vld; + OLst <= last_item; + OFin <= last_rep && last_item; + end + end + end + + end + +endmodule : replay_buffer diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv new file mode 100644 index 0000000000..34b5d8eb53 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv @@ -0,0 +1,165 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. 
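+ * @details
+ * Feeds the DUT with randomized activation and weight streams and compares
+ * the PE accumulator outputs against a behavioral golden model whenever the
+ * DUT raises its vld flag.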
+ *****************************************************************************/ + +module mvu_8sx9_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + +endmodule : mvu_8sx9_tb diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv new file mode 100644 index 0000000000..4ed7b4bf5f --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -0,0 +1,229 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Testbench for MVU AXI wrapper module.
+ *****************************************************************************/
+
+module mvu_axi_tb();
+
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam bit IS_MVU = 1;
+	localparam string COMPUTE_CORE = "mvu_4sx4u";
+	localparam int unsigned MW = 120;
+	localparam int unsigned MH = 40;
+	localparam int unsigned SIMD = 20;
+	localparam int unsigned PE = 10;
+	localparam int unsigned SEGMENTLEN = 2;
+	localparam bit FORCE_BEHAVIORAL = 1;
+	localparam bit M_REG_LUT = 1;
+	// Bit-width config
+	localparam int unsigned ACTIVATION_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+	localparam bit SIGNED_ACTIVATIONS = 0;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+	localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk and reset signal
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk);
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations
+	typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin
+		activations.vld = 0;
+		activations.dat = 'X;
+		@(posedge clk iff ap_rst_n);
+
+		for (int i=0; i<SF; i++) begin
+			// Loop until both vld & rdy are asserted
+			do begin
+				activations.dat <= ACTIVATIONS[i];
+				activations.vld <= $urandom()%7 >= 0; // effectively constant 1: a stream master must not retract tvalid before the handshake
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1));
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin
+		weights.vld = 0;
+		weights.dat = 'X;
+		@(posedge clk iff ap_rst_n);
+
+		weights.vld <= 1;
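+		// Feed the weight matrix as NF*SF stream beats of PE*SIMD weights each;
+		// tvalid stays asserted and tdata advances on every completed handshake.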
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy);
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Monitor the output stream
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	// Compute the golden output against which the DUT outputs are checked
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+		for (int i = 0; i < NF; i++) begin
+			for (int j = 0; j < SF; j++) begin
+				for (int k = 0; k < PE; k++) begin
+					for (int l = 0; l < SIMD; l++) begin
+						if (SIGNED_ACTIVATIONS)
+							res[i][k] = $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]);
+						else
+							res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]);
+					end
+				end
+			end
+		end
+		return res;
+	endfunction : check_output
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0;
+	initial begin
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 != 0; // random backpressure: tready may toggle freely
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+					$stop;
+				end
+			end
+
+			NF_CNT += 1;
+		end
+
+		$finish;
+	end
+
+	// Instantiate DUT
+	mvu_vvu_axi #(
+		.IS_MVU(IS_MVU),
+		.COMPUTE_CORE(COMPUTE_CORE),
+		.MW(MW),
+		.MH(MH),
+		.PE(PE),
+		.SIMD(SIMD),
+		.ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+		.WEIGHT_WIDTH(WEIGHT_WIDTH),
+		.ACCU_WIDTH(ACCU_WIDTH),
+		.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+		.SEGMENTLEN(SEGMENTLEN),
+		.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL),
+		.M_REG_LUT(M_REG_LUT)
+	)
+	dut (
+		.ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld),
+		.s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld),
+		.s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld),
+		.m_axis_output_tready(outputs.rdy)
+	);
+
+endmodule : mvu_axi_tb
diff --git a/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv
new file mode 100644
index 0000000000..108980c497
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv
@@ -0,0 +1,142 @@
+module mvu_dsp58_tb;
+
+	localparam int unsigned N = 1000;
+
+	localparam int unsigned MW = 12;
+	localparam int unsigned MH = 4;
+	localparam int unsigned PE = 2;
+	localparam int unsigned SIMD = 6;
+	localparam int unsigned ACTIVATION_WIDTH = 8;
+	localparam int unsigned WEIGHT_WIDTH = 8;
+	localparam int unsigned ACCU_WIDTH = 24;
+
+	//- Global Control ------------------
+	logic clk = 1;
+	logic clk2x = 1;
+	always #5ns clk = !clk;
+	always #2.5ns clk2x = !clk2x;
+
+	logic rst = 1;
+	initial begin
+		repeat(8) @(posedge clk);
+		rst <= 0;
+	end
+
+	//- DUTs ----------------------------
+
+	// Weight Stream
+	logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] s_axis_weights_tdata;
+	logic s_axis_weights_tvalid[2];
+	uwire s_axis_weights_tready[2];
+
+	// Input Stream
+	logic [SIMD-1:0][ACTIVATION_WIDTH-1:0]
s_axis_input_tdata; + logic s_axis_input_tvalid[2]; + uwire s_axis_input_tready[2]; + + // Output Stream + uwire [PE-1:0][ACCU_WIDTH-1:0] m_axis_output_tdata[2]; + uwire m_axis_output_tvalid[2]; + logic m_axis_output_tready[2]; + + for(genvar i = 0; i < 2; i++) begin : genDUTs + mvu_vvu_axi #( + .IS_MVU(1), + .COMPUTE_CORE("mvu_vvu_8sx9_dsp58"), + .MW(MW), .MH(MH), + .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .PUMPED_COMPUTE(i) + ) dut ( + .ap_clk(clk), .ap_clk2x(clk2x), .ap_rst_n(!rst), + .s_axis_weights_tdata, .s_axis_weights_tvalid(s_axis_weights_tvalid[i]), .s_axis_weights_tready(s_axis_weights_tready[i]), + .s_axis_input_tdata, .s_axis_input_tvalid (s_axis_input_tvalid [i]), .s_axis_input_tready (s_axis_input_tready [i]), + .m_axis_output_tdata(m_axis_output_tdata[i]), .m_axis_output_tvalid (m_axis_output_tvalid [i]), .m_axis_output_tready (m_axis_output_tready [i]) + ); + end : genDUTs + + + //- Stimuli ------------------------- + + // Weight Feed + initial begin + s_axis_weights_tvalid = '{ default: 0 }; + s_axis_weights_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)*(MW/SIMD)) begin + automatic type(s_axis_weights_tdata) weights; + std::randomize(weights); + s_axis_weights_tdata <= weights; + s_axis_weights_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_weights_tready[0]); + s_axis_weights_tvalid[0] <= 0; + end + begin + @(posedge clk iff s_axis_weights_tready[1]); + s_axis_weights_tvalid[1] <= 0; + end + join + end + end + + // Input Feed + initial begin + s_axis_input_tvalid = '{ default: 0 }; + s_axis_input_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MW/SIMD)) begin + automatic type(s_axis_input_tdata) in; + std::randomize(in); + s_axis_input_tdata <= in; + s_axis_input_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_input_tready[0]); + s_axis_input_tvalid[0] <= 0; + end + begin + @(posedge clk iff s_axis_input_tready[1]); + s_axis_input_tvalid[1] <= 0; + end + join + end + end + + // Output Capture and Comparison + initial begin + m_axis_output_tready = '{ default: 0 }; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)) begin + automatic type(m_axis_output_tdata) res; + m_axis_output_tready <= '{ default: 1 }; + fork + begin + @(posedge clk iff m_axis_output_tvalid[0]); + m_axis_output_tready[0] <= 0; + res[0] = m_axis_output_tdata[0]; + end + begin + @(posedge clk iff m_axis_output_tvalid[1]); + m_axis_output_tready[1] <= 0; + res[1] = m_axis_output_tdata[1]; + end + join + assert(res[0] == res[1]) else begin + $error("Output mismatch: %0x <=> %0x", res[0], res[1]); + $stop; + end + while($urandom()%7 < MW/SIMD) @(posedge clk); // Occassional backpressure + end + + $display("Test completed."); + $finish; + end + +endmodule : mvu_dsp58_tb diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv new file mode 100644 index 0000000000..5581354e0e --- /dev/null +++ b/finn-rtllib/mvu/tb/replay_buffer_tb.sv @@ -0,0 +1,130 @@ +/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for replay_buffer module. + * @author Thomas B. Preußer + *****************************************************************************/ + +module replay_buffer_tb; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + uwire rst = 0; + + // DUT Geometries + localparam int unsigned DIMS[3] = '{ 7, 8, 10 }; + localparam int unsigned W = 8; + typedef logic [W-1:0] data_t; + + bit [2**$size(DIMS)-1:0] done = 0; + always_comb begin + if(&done) begin + $display("Test completed."); + $finish; + end + end + + // Parallel DUT Instantiations + for(genvar r = 0; r < $size(DIMS); r++) begin + for(genvar l = 0; l < $size(DIMS); l++) begin + localparam int unsigned REP = DIMS[r]; + localparam int unsigned LEN = DIMS[l]; + + data_t idat; + logic ivld; + uwire irdy; + + uwire data_t odat; + uwire olast; + uwire ofin; + uwire ovld; + logic ordy; + + replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut ( + .clk, .rst, + .idat, .ivld, .irdy, + .odat, .olast, .ofin, .ovld, .ordy + ); + + // Input Feed: 0, 1, ..., 10*LEN-1 + initial begin + idat = 'x; + ivld = 0; + @(posedge clk iff !rst); + + for(int unsigned i = 0; i < 10*LEN; i++) begin + idat <= i; + ivld <= 1; + @(posedge clk iff irdy); + idat <= 'x; + ivld <= 0; + while($urandom()%(REP-1) != 0) @(posedge clk); + end + end + + // Output Check + initial begin + automatic int unsigned base = 0; + + ordy = 0; + @(posedge clk iff !rst); + + for(int unsigned k = 0; k < 10; k++) begin + for(int unsigned j = 0; j < REP; j++) begin + for(int unsigned i = 0; i < LEN; i++) begin + ordy <= 1; + @(posedge clk iff ovld); + assert(odat == base+i) else begin + $error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i); + $stop; + end + assert(olast == (i == LEN-1)) else begin + $error("#%0d.%0d: Last mismatch.", r, l); + $stop; + end + assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin + $error("#%0d.%0d: Fin mismatch.", r, l); + $stop; + end + + ordy <= 0; + while($urandom()%13 == 0) @(posedge clk); + end + end + base += LEN; + end + + done[$size(DIMS)*r + l] <= 1; + end + end + end + +endmodule : replay_buffer_tb diff --git a/finn-rtllib/mvu/tb/vvu_axi_tb.sv b/finn-rtllib/mvu/tb/vvu_axi_tb.sv new file mode 100644 index 0000000000..853dcc6e17 --- /dev/null +++ b/finn-rtllib/mvu/tb/vvu_axi_tb.sv @@ -0,0 +1,227 @@ 
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Testbench for VVU AXI wrapper module.
+ *****************************************************************************/
+
+module vvu_axi_tb();
+
+//-------------------- Simulation parameters --------------------\\
+	// Matrix & parallelism config
+	localparam bit IS_MVU = 0;
+	localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
+	localparam int unsigned MW = 25; // Kernel*Kernel
+	localparam int unsigned MH = 4; // Channels
+	localparam int unsigned SIMD = 1; // MW%SIMD == 0
+	localparam int unsigned PE = 1; // MH%PE == 0
+	localparam int unsigned SEGMENTLEN = 1;
+	localparam bit FORCE_BEHAVIORAL = 1;
+	localparam bit M_REG_LUT = 1;
+	// Bit-width config
+	localparam int unsigned ACTIVATION_WIDTH = 4;
+	localparam int unsigned WEIGHT_WIDTH = 4;
+	localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+	localparam bit SIGNED_ACTIVATIONS = 1;
+	// Simulation constants
+	localparam int unsigned NF = MH/PE;
+	localparam int unsigned SF = MW/SIMD;
+	localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+	localparam int unsigned ACTIVATION_WIDTH_BA = (PE*SIMD*ACTIVATION_WIDTH+7)/8*8;
+	localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+	localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - PE*SIMD*ACTIVATION_WIDTH;
+	localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+	// Generate clk and reset signal
+	logic clk = 0;
+	always #5ns clk = !clk;
+
+	logic ap_rst_n = 0;
+	initial begin
+		repeat(16) @(posedge clk);
+		ap_rst_n <= 1;
+	end
+
+	uwire ap_clk = clk;
+
+	// Generate activations
+	typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+	typedef activation_t activation_vector_t[NF*SF];
+
+	function activation_vector_t init_ACTIVATIONS;
+		automatic activation_vector_t res;
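+		// Unconstrained randomization of the full activation trace (NF*SF beats)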
+		std::randomize(res);
+		return res;
+	endfunction : init_ACTIVATIONS
+
+	activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+	struct {
+		activation_t dat;
+		logic vld;
+		logic rdy;
+	} activations;
+
+	initial begin
+		activations.vld = 0;
+		activations.dat = 'X;
+		@(posedge clk iff ap_rst_n);
+
+		for (int i=0; i<NF*SF; i++) begin
+			// Loop until both vld & rdy are asserted
+			do begin
+				activations.dat <= ACTIVATIONS[i];
+				activations.vld <= $urandom()%7 >= 0; // effectively constant 1: a stream master must not retract tvalid before the handshake
+				@(posedge clk);
+			end while (!(activations.vld === 1 && activations.rdy === 1));
+		end
+
+		activations.vld <= 0;
+		activations.dat <= 'x;
+	end
+
+	// Generate weights
+	typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+	typedef weight_t weight_matrix_t[NF][SF];
+
+	function weight_matrix_t init_WEIGHTS;
+		automatic weight_matrix_t res;
+		std::randomize(res);
+		return res;
+	endfunction : init_WEIGHTS
+
+	weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+	struct {
+		weight_t dat;
+		logic vld;
+		logic rdy;
+	} weights;
+
+	initial begin
+		weights.vld = 0;
+		weights.dat = 'X;
+		@(posedge clk iff ap_rst_n);
+
+		weights.vld <= 1;
+		for (int i=0; i<NF; i++) begin
+			for (int j=0; j<SF; j++) begin
+				weights.dat <= WEIGHTS[i][j];
+				@(posedge clk iff weights.rdy);
+			end
+		end
+
+		weights.vld <= 0;
+		weights.dat <= 'x;
+	end
+
+	// Monitor the output stream
+	typedef logic signed [PE-1:0][ACCU_WIDTH-1:0] output_t;
+	typedef output_t output_vector_t [NF];
+
+	struct {
+		output_t dat;
+		logic vld;
+		logic rdy;
+	} outputs;
+
+	// Compute the golden output against which the DUT outputs are checked
+	function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+		automatic output_vector_t res = '{default: 0};
+		// The input stream will have the channels interleaved for VVU when PE>1
+		// Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+		// Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+		// (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+		// (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+		for (int i = 0; i < NF; i++) begin
+			for (int j = 0; j < SF; j++) begin
+				for (int k = 0; k < PE; k++) begin
+					for (int l = 0; l < SIMD; l++) begin
+						if (SIGNED_ACTIVATIONS)
+							res[i][k] = $signed(res[i][k]) + $signed(a[i*SF+j][k + l*PE]) * $signed(w[i][j][k][l]);
+						else
+							res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[i*SF+j][k + l*PE]}) * $signed(w[i][j][k][l]);
+					end
+				end
+			end
+		end
+		return res;
+	endfunction : check_output
+
+	output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+	int unsigned NF_CNT = 0;
+	initial begin
+		outputs.rdy = 0;
+		while (NF_CNT < NF) begin
+			// Loop until both rdy & vld are asserted
+			do begin
+				outputs.rdy <= $urandom()%7 != 0; // random backpressure: tready may toggle freely
+				@(posedge clk iff ap_rst_n);
+			end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+			// Compare produced outputs against golden outputs
+			foreach(outputs.dat[i]) begin
+				assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+				else begin
+					$error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : vvu_axi_tb diff --git a/finn-rtllib/thresholding/hdl/axilite_if.v b/finn-rtllib/thresholding/hdl/axilite_if.v new file mode 100644 index 0000000000..2aeff770d2 --- /dev/null +++ b/finn-rtllib/thresholding/hdl/axilite_if.v @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
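+ *
+ * @brief AXI4-Lite slave shim exposing a simple enable/write/address port to
+ * the IP core, folding IP_DATA_WIDTH-bit words onto DATA_WIDTH-bit bus
+ * transactions.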
+ *****************************************************************************/ + +module axi4lite_if +#( + parameter ADDR_WIDTH = 32, + parameter DATA_WIDTH = 32,//AXI4 spec requires this to be strictly 32 or 64 + parameter IP_DATA_WIDTH = 64//can be any power-of-2 multiple of DATA_WIDTH +) +( +//system signals +input aclk, +input aresetn,//active low, asynchronous assertion and synchronous deassertion + +//Write channels +//write address +output reg awready, +input awvalid, +input [ADDR_WIDTH-1:0] awaddr, +input [2:0] awprot, +//write data +output reg wready, +input wvalid, +input [DATA_WIDTH-1:0] wdata, +input [(DATA_WIDTH/8)-1:0] wstrb, +//burst response +input bready, +output reg bvalid, +output reg [1:0] bresp,//NOTE: 00 = OKAY, 10 = SLVERR (write error) + +//Read channels +//read address +output reg arready, +input arvalid, +input [ADDR_WIDTH-1:0] araddr, +input [2:0] arprot, +//read data +input rready, +output reg rvalid, +output reg [1:0] rresp,//NOTE: 00 = OKAY, 10 = SLVERR (read error) +output reg [DATA_WIDTH-1:0] rdata, + +//IP-side interface +output reg ip_en, +output reg ip_wen, +output reg [ADDR_WIDTH-1:0] ip_addr, +output [IP_DATA_WIDTH-1:0] ip_wdata, +input ip_rack, +input [IP_DATA_WIDTH-1:0] ip_rdata +); + +localparam RESP_OKAY = 2'b00; +localparam RESP_SLVERR = 2'b10; +//get ceil(log2(ceil(IP_DATA_WIDTH/DATA_WIDTH))) +localparam NFOLDS_LOG = $clog2((IP_DATA_WIDTH + DATA_WIDTH - 1) / DATA_WIDTH); + +reg internal_ren; +reg internal_wen; +reg internal_wack; +reg [ADDR_WIDTH-1:0] internal_raddr; +reg [ADDR_WIDTH-1:0] internal_waddr; +reg [DATA_WIDTH-1:0] internal_wdata; +wire [DATA_WIDTH-1:0] internal_rdata; +reg internal_error = 0; + +//check DATA_WIDTH +initial begin + if(DATA_WIDTH != 32 & DATA_WIDTH != 64) begin + $display("AXI4Lite DATA_WIDTH must be 32 or 64"); + $finish; + end +end + +//transaction state machine +localparam STATE_IDLE = 0, + STATE_READ = 1, + STATE_WRITE = 2; + +reg [1:0] state; + +always @(posedge aclk or negedge aresetn) + if(~aresetn) + state <= STATE_IDLE; + else case(state) + STATE_IDLE: + if(awvalid & wvalid) + state <= STATE_WRITE; + else if(arvalid) + state <= STATE_READ; + STATE_READ: + if(rvalid & rready) + state <= STATE_IDLE; + STATE_WRITE: + if(bvalid & bready) + state <= STATE_IDLE; + default: state <= STATE_IDLE; + endcase + +//write-related internal signals +always @(*) begin + internal_waddr = awaddr >> $clog2(DATA_WIDTH/8); + internal_wdata = wdata; + internal_wen = (state == STATE_IDLE) & awvalid & wvalid; +end + +always @(posedge aclk) begin + awready <= internal_wen; + wready <= internal_wen; +end + +//read-related internal signals +always @(*) begin + internal_raddr = araddr >> $clog2(DATA_WIDTH/8); + internal_ren = (state == STATE_IDLE) & ~internal_wen & arvalid; +end + +always @(posedge aclk) + arready <= internal_ren; + +wire write_to_last_fold; + +always @(posedge aclk) begin + ip_wen <= write_to_last_fold; + ip_en <= internal_ren | write_to_last_fold; + if(internal_ren | write_to_last_fold) + ip_addr <= internal_ren ? (internal_raddr >> NFOLDS_LOG) : (internal_waddr >> NFOLDS_LOG); + internal_wack <= internal_wen; +end + +genvar i; +reg [(1<> (internal_rfold*DATA_WIDTH); + always @(posedge aclk) + if(internal_ren) + internal_rfold <= internal_raddr[NFOLDS_LOG-1:0]; + for(i=0; i<(1< + * + * @description + * Produces the N-bit count of those among 2^N-1 thresholds that are not + * larger than the corresponding input: + * y = Σ(T_i <= x) + * The result is computed by binary search. 
The runtime-configurable + * thresholds must be written in ascending order: + * i < j => T_i < T_j + * The design supports channel folding allowing each input to be processed + * with respect to a selectable set of thresholds. The corresponding + * threshold configuration relies on a channel address prefix. Inputs are + * accompanied by a channel selector. + * + * Parameter Layout as seen on AXI-Lite (row by row): + * | Base \ Offs | 0 1 2 ... 2^N-2 2^N-1 + * ---------+--------------------------------+------------------------------------ + * Chnl #0 | 0 | T_0 T_1 T_2 ... T_{2^N-2} 'x + * Chnl #1 | 2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x + * Chnl #c | ((c/PE)*$clog2(PE) + c%PE)*2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x + * + *****************************************************************************/ +module thresholding #( + int unsigned N, // output precision + int unsigned K, // input/threshold precision + int unsigned C, // number of channels + int unsigned PE, // parallel processing elements + + bit SIGNED = 1, // signed inputs + bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + // Initial Thresholds + parameter THRESHOLDS_PATH = "", + bit USE_CONFIG = 1, + + // Force Use of On-Chip Memory Blocks + int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio) + int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM + bit DEEP_PIPELINE = 0, + + localparam int unsigned CF = C/PE, // Channel fold + localparam int unsigned O_BITS = BIAS >= 0? + /* unsigned */ $clog2(2**N+BIAS) : + /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS) +)( + // Global Control + input logic clk, + input logic rst, + + // Threshold Configuration + input logic cfg_en, + input logic cfg_we, + input logic [$clog2(CF)+$clog2(PE)+N-1:0] cfg_a, + input logic [K-1:0] cfg_d, + output logic cfg_rack, + output logic [K-1:0] cfg_q, + + // Input Stream + output logic irdy, + input logic ivld, + input logic [PE-1:0][K-1:0] idat, + + // Output Stream + input logic ordy, + output logic ovld, + output logic [PE-1:0][O_BITS-1:0] odat +); + + // Parameter Constraints Checking + initial begin + if(CF*PE != C) begin + $error("Parallelism PE=%0d is not a multiple of channel count C=%0d.", PE, C); + $finish; + end + end + + // Operations within Pipeline + typedef enum logic [1:0] { + NOP = 2'b00, // No operation + TH = 2'b01, // Thresholding + WR = 2'b11, // Write (initialization) + RB = 2'b10, // Readback (validation) + CFG = 2'b1x // Config op (pointer-preserving) + } op_e; + + // Pipeline Link Type + typedef logic [$clog2(CF)+N-1:0] ptr_t; + typedef logic [K -1:0] val_t; + typedef struct packed { + op_e op; + ptr_t ptr; // WR/RB: address; TH: result + val_t val; // WR/RB: threshold value; TH: input value + } pipe_t; + + //----------------------------------------------------------------------- + // Pipeline Feed + // - configuration always takes precedence + // - number of pending thresholding ops capped to N+3 + // across pipeline and output FIFO: pipe:N + A:1 + B:1 + 1 + localparam int unsigned MAX_PENDING = (DEEP_PIPELINE+1)*N + 3; + pipe_t pipe[PE][N+1]; + if(1) begin : blkFeed + + // Thresholding Input Guard ensuring Output FIFO is never overrun + logic signed [$clog2(MAX_PENDING):0] GuardSem = MAX_PENDING-1; // MAX_PENDING-1, ..., 0, -1 + uwire th_full = GuardSem[$left(GuardSem)]; + always_ff @(posedge clk) begin + if(rst) GuardSem <= 
MAX_PENDING-1; + else begin + automatic logic dec = !(USE_CONFIG && cfg_en) && !th_full && ivld; + automatic logic inc = ovld && ordy; + GuardSem <= GuardSem + (inc == dec? 0 : inc? 1 : -1); + end + end + + // PE Configuration Address Decoding + logic cfg_sel[PE]; + logic cfg_oob; + logic [N-1:0] cfg_ofs; + if(PE == 1) begin + assign cfg_sel[0] = 1; + assign cfg_oob = 0; + assign cfg_ofs = cfg_a[0+:N]; + end + else begin + uwire [$clog2(PE)-1:0] cfg_pe = cfg_a[N+:$clog2(PE)]; + always_comb begin + foreach(cfg_sel[pe]) begin + cfg_sel[pe] = USE_CONFIG && cfg_en && (cfg_pe == pe); + end + cfg_oob = (cfg_pe >= PE); + cfg_ofs = cfg_a[0+:N]; + if(cfg_oob && !cfg_we) begin + // Map readbacks from padded rows (non-existent PEs) to padded highest threshold index of first PE + cfg_sel[0] = 1; + cfg_ofs = '1; + end + end + end + + uwire ptr_t iptr; + assign iptr[0+:N] = cfg_ofs; + if(CF > 1) begin + // Channel Fold Rotation + logic [$clog2(CF)-1:0] CnlCnt = 0; + logic CnlLst = 0; + always_ff @(posedge clk) begin + if(rst) begin + CnlCnt <= 0; + CnlLst <= 0; + end + else if(!(USE_CONFIG && cfg_en) && !th_full && ivld) begin + CnlCnt <= CnlCnt + (CnlLst? 1-CF : 1); + CnlLst <= CnlCnt == CF-2; + end + end + + assign iptr[N+:$clog2(CF)] = USE_CONFIG && cfg_en? cfg_a[N+$clog2(PE)+:$clog2(CF)] : CnlCnt; + end + + for(genvar pe = 0; pe < PE; pe++) begin + assign pipe[pe][0] = '{ + op: USE_CONFIG && cfg_en? + (!cfg_sel[pe]? NOP : cfg_we? WR : RB) : + (ivld && !th_full? TH : NOP), + ptr: iptr, + val: !(USE_CONFIG && cfg_en)? idat[pe] : cfg_we? cfg_d : 0 + }; + end + + assign irdy = !(USE_CONFIG && cfg_en) && !th_full; + end : blkFeed + + //----------------------------------------------------------------------- + // Free-Running Thresholding Pipeline + for(genvar stage = 0; stage < N; stage++) begin : genStages + + localparam int unsigned SN = N-1-stage; + for(genvar pe = 0; pe < PE; pe++) begin : genPE + uwire pipe_t p = pipe[pe][stage]; + uwire cs = (p.ptr[SN:0] == 2**SN-1); + + // Threshold Memory + val_t Thresh; // Read-out register + if(1) begin : blkThresh + localparam int unsigned DEPTH = CF * 2**stage; + localparam RAM_STYLE = + DEPTH_TRIGGER_URAM && (DEPTH >= DEPTH_TRIGGER_URAM)? "ultra" : + DEPTH_TRIGGER_BRAM && (DEPTH >= DEPTH_TRIGGER_BRAM)? "block" : + // If BRAM trigger defined, force distributed memory below if Vivado may be tempted to use BRAM nonetheless. + DEPTH_TRIGGER_BRAM && (DEPTH >= 64)? "distributed" : "auto"; + + (* RAM_STYLE = RAM_STYLE *) + val_t Threshs[DEPTH]; + if(THRESHOLDS_PATH != "") begin + initial $readmemh($sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage), Threshs); + end + + if(USE_CONFIG) begin : genThreshMem + uwire we = (p.op ==? WR) && cs; + if((CF == 1) && (stage == 0)) begin + always @(posedge clk) begin + if(we) Threshs[0] <= p.val; + end + end + else begin + uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1]; + always @(posedge clk) begin + if(we) Threshs[addr] <= p.val; + end + end + end : genThreshMem + + if((CF == 1) && (stage == 0)) begin + assign Thresh = Threshs[0]; + end + else begin + uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1]; + always_ff @(posedge clk) begin + Thresh <= Threshs[addr]; + end + end + + end : blkThresh + + // Pipeline State + pipe_t P = '{ op: NOP, default: 'x }; + logic Reval = 0; + always_ff @(posedge clk) begin + if(rst) begin + P <= '{ op: NOP, default: 'x }; + Reval <= 0; + end + else begin + P <= p; + Reval <= (p.op ==? 
RB) && cs; + end + end + + logic cmp; + if(!SIGNED) assign cmp = $unsigned(Thresh) <= $unsigned(P.val); + else if(!FPARG) assign cmp = $signed(Thresh) <= $signed(P.val); + else begin : blkSignedFloat + uwire mag_eq = Thresh[K-2:0] == P.val[K-2:0]; + uwire mag_le = Thresh[K-2:0] <= P.val[K-2:0]; + always_comb begin + unique case({Thresh[K-1], P.val[K-1]}) + 2'b00: cmp = mag_le; + 2'b01: cmp = 0; + 2'b10: cmp = 1; + 2'b11: cmp = !mag_le || mag_eq; + default: cmp = 'x; + endcase + end + end : blkSignedFloat + + // Pipeline State Update + pipe_t pp; + always_comb begin + pp = P; + if(P.op !=? CFG) pp.ptr[SN] = cmp; + if(Reval) pp.val = Thresh; + end + + // Pipeline State Forward (potentially additional register) + pipe_t pf; + if(!DEEP_PIPELINE) assign pf = pp; + else begin + pipe_t Pf = '{ op: NOP, default: 'x }; + always_ff @(posedge clk) begin + if(rst) Pf <= '{ op: NOP, default: 'x }; + else Pf <= pp; + end + assign pf = Pf; + end + + assign pipe[pe][stage+1] = pf; + + end : genPE + end : genStages + + //----------------------------------------------------------------------- + // Configuration Readback + always_comb begin + cfg_rack = 0; + cfg_q = 0; + foreach(pipe[pe]) begin + automatic pipe_t p = pipe[pe][N]; + cfg_rack |= p.op ==? RB; + cfg_q |= p.val; + end + end + + //----------------------------------------------------------------------- + // Stream Output through FIFO + // - Depth of N + Output Reg to allow pipe to drain entirely under backpressure + // - Typically mapped to an SRL shift register + if(1) begin : blkStreamOutput + localparam int unsigned A_DEPTH = MAX_PENDING - 1; + logic [PE-1 : 0][N-1 : 0] ADat[A_DEPTH]; + logic signed [$clog2(A_DEPTH):0] APtr = '1; // -1, 0, 1, ..., A_DEPTH-1 + uwire avld = !APtr[$left(APtr)]; + + logic [PE-1:0][N-1:0] BDat = 'x; + logic BVld = 0; + + uwire aload = pipe[0][N].op ==? TH; + uwire bload = !BVld || ordy; + + always_ff @(posedge clk) begin + if(aload) begin + assert(APtr < $signed(A_DEPTH-1)) else begin + $error("Overrun after failing stream guard."); + $stop; + end + foreach(pipe[pe]) ADat[0][pe] <= pipe[pe][N].ptr; + for(int unsigned i = 1; i < A_DEPTH; i++) ADat[i] <= ADat[i-1]; + end + end + always_ff @(posedge clk) begin + if(rst) APtr <= '1; + else APtr <= APtr + (aload == (avld && bload)? 0 : aload? 1 : -1); + end + always_ff @(posedge clk) begin + if(rst) begin + BDat <= 'x; + BVld <= 0; + end + else if(bload) begin + BDat <= ADat[APtr]; + BVld <= avld; + end + end + + assign ovld = BVld; + for(genvar pe = 0; pe < PE; pe++) begin + assign odat[pe] = BDat[pe] + BIAS; + end + end : blkStreamOutput + +endmodule : thresholding diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv new file mode 100644 index 0000000000..5c7182b214 --- /dev/null +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -0,0 +1,164 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief All-AXI interface adapter for thresholding module. + * @author Thomas B. Preußer + * + * @description + * This AXI adapter fits the core thresholding functionality: + * - with AXI stream data interfaces with flow control + * - with implicit round-robin channel rotation as used by FINN, and + * - performs aligned byte address to parameter word address translation. + *****************************************************************************/ + +module thresholding_axi #( + int unsigned N, // output precision + int unsigned K, // input/threshold precision + int unsigned C = 1, // Channels + int unsigned PE = 1, // Processing Parallelism, requires C = k*PE + + bit SIGNED = 1, // signed inputs + bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + // Initial Thresholds + parameter THRESHOLDS_PATH = "", + + bit USE_AXILITE, // Implement AXI-Lite for threshold read/write + + // Force Use of On-Chip Memory Blocks + int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio) + int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM + bit DEEP_PIPELINE = 0, + + localparam int unsigned CF = C/PE, // Channel Fold + localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2, + localparam int unsigned O_BITS = BIAS >= 0? + /* unsigned */ $clog2(2**N+BIAS) : + /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? 
-BIAS : 2**N+BIAS) +)( + //- Global Control ------------------ + input logic ap_clk, + input logic ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input logic s_axilite_AWVALID, + output logic s_axilite_AWREADY, + input logic [ADDR_BITS-1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored + + input logic s_axilite_WVALID, + output logic s_axilite_WREADY, + input logic [31:0] s_axilite_WDATA, + input logic [ 3:0] s_axilite_WSTRB, + + output logic s_axilite_BVALID, + input logic s_axilite_BREADY, + output logic [1:0] s_axilite_BRESP, + + // Reading + input logic s_axilite_ARVALID, + output logic s_axilite_ARREADY, + input logic [ADDR_BITS-1:0] s_axilite_ARADDR, + + output logic s_axilite_RVALID, + input logic s_axilite_RREADY, + output logic [31:0] s_axilite_RDATA, + output logic [ 1:0] s_axilite_RRESP, + + //- AXI Stream - Input -------------- + output logic s_axis_tready, + input logic s_axis_tvalid, + input logic [((PE*K+7)/8)*8-1:0] s_axis_tdata, + + //- AXI Stream - Output ------------- + input logic m_axis_tready, + output logic m_axis_tvalid, + output logic [((PE*O_BITS+7)/8)*8-1:0] m_axis_tdata +); + + //----------------------------------------------------------------------- + // AXI-lite Configuration Interface + uwire cfg_en; + uwire cfg_we; + uwire [ADDR_BITS-3:0] cfg_a; + uwire [K -1:0] cfg_d; + uwire cfg_rack; + uwire [K -1:0] cfg_q; + + if(USE_AXILITE) begin + uwire [ADDR_BITS-1:0] cfg_a0; + axi4lite_if #(.ADDR_WIDTH(ADDR_BITS), .DATA_WIDTH(32), .IP_DATA_WIDTH(K)) axi ( + .aclk(ap_clk), .aresetn(ap_rst_n), + + .awready(s_axilite_AWREADY), .awvalid(s_axilite_AWVALID), .awaddr(s_axilite_AWADDR), .awprot('x), + .wready(s_axilite_WREADY), .wvalid(s_axilite_WVALID), .wdata(s_axilite_WDATA), .wstrb(s_axilite_WSTRB), + .bready(s_axilite_BREADY), .bvalid(s_axilite_BVALID), .bresp(s_axilite_BRESP), + + .arready(s_axilite_ARREADY), .arvalid(s_axilite_ARVALID), .araddr(s_axilite_ARADDR), .arprot('x), + .rready(s_axilite_RREADY), .rvalid(s_axilite_RVALID), .rresp(s_axilite_RRESP), .rdata(s_axilite_RDATA), + + .ip_en(cfg_en), .ip_wen(cfg_we), .ip_addr(cfg_a0), .ip_wdata(cfg_d), + .ip_rack(cfg_rack), .ip_rdata(cfg_q) + ); + assign cfg_a = cfg_a0[ADDR_BITS-3:0]; + always_ff @(posedge ap_clk) begin + assert(!ap_rst_n || !cfg_en || (cfg_a0[ADDR_BITS-2+:2] === 3'h0)) else begin + $error("%m: Spurious high address bits."); + $stop; + end + end + end + else begin + assign cfg_en = 0; + assign cfg_we = 'x; + assign cfg_a = 'x; + assign cfg_d = 'x; + end + + //----------------------------------------------------------------------- + // Kernel Implementation + thresholding #( + .N(N), .K(K), .C(C), .PE(PE), + .SIGNED(SIGNED), .FPARG(FPARG), .BIAS(BIAS), + .THRESHOLDS_PATH(THRESHOLDS_PATH), .USE_CONFIG(USE_AXILITE), + .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM), + .DEEP_PIPELINE(DEEP_PIPELINE) + ) impl ( + .clk(ap_clk), .rst(!ap_rst_n), + + .cfg_en, .cfg_we, .cfg_a, .cfg_d, + .cfg_rack, .cfg_q, + + .irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat(s_axis_tdata), + .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata) + ); + +endmodule : thresholding_axi diff --git a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v new file mode 100644 index 0000000000..f35db156f6 --- /dev/null +++ b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v @@ -0,0 +1,121 @@ +/****************************************************************************** + * 
Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + * @brief Verilog wrapper for IP packaging. + */ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter N = $N$, // output precision + parameter K = $M$, // input/threshold precision + parameter C = $C$, // Channels + parameter PE = $PE$, // Processing Parallelism, requires C = k*PE + + parameter SIGNED = $SIGNED$, // signed inputs + parameter FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + parameter BIAS = $BIAS$, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + parameter THRESHOLDS_PATH = $THRESHOLDS_PATH$, // Directory with initial threshold data + parameter USE_AXILITE = $USE_AXILITE$, // Implement AXI-Lite for threshold read/write + + // Force Use of On-Chip Memory Blocks + parameter DEPTH_TRIGGER_URAM = $DEPTH_TRIGGER_URAM$, // if non-zero, local mems of this depth or more go into URAM (prio) + parameter DEPTH_TRIGGER_BRAM = $DEPTH_TRIGGER_BRAM$, // if non-zero, local mems of this depth or more go into BRAM + parameter DEEP_PIPELINE = $DEEP_PIPELINE$, // [bit] extra pipeline stages for easier timing closure + + parameter O_BITS = $O_BITS$ +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axilite:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input s_axilite_AWVALID, + output s_axilite_AWREADY, + input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored + + input s_axilite_WVALID, + output s_axilite_WREADY, + input [31:0] s_axilite_WDATA, + input [ 3:0] s_axilite_WSTRB, + + output s_axilite_BVALID, + input s_axilite_BREADY, + output [1:0] s_axilite_BRESP, + + // Reading + input s_axilite_ARVALID, + output s_axilite_ARREADY, + input [$clog2(C/PE) + $clog2(PE) + N + 1:0] 
s_axilite_ARADDR, + + output s_axilite_RVALID, + input s_axilite_RREADY, + output [31:0] s_axilite_RDATA, + output [ 1:0] s_axilite_RRESP, + + //- AXI Stream - Input -------------- + output in0_V_TREADY, + input in0_V_TVALID, + input [((PE*K+7)/8)*8-1:0] in0_V_TDATA, + + //- AXI Stream - Output ------------- + input out_V_TREADY, + output out_V_TVALID, + output [((PE*O_BITS+7)/8)*8-1:0] out_V_TDATA +); + + thresholding_axi #( + .N(N), .K(K), .C(C), .PE(PE), + .SIGNED(SIGNED), + .FPARG(FPARG), + .BIAS(BIAS), + .THRESHOLDS_PATH(THRESHOLDS_PATH), + .USE_AXILITE(USE_AXILITE), + .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), + .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM), + .DEEP_PIPELINE(DEEP_PIPELINE) + ) core ( + .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), + + .s_axilite_AWVALID(s_axilite_AWVALID), .s_axilite_AWREADY(s_axilite_AWREADY), .s_axilite_AWADDR(s_axilite_AWADDR), + .s_axilite_WVALID(s_axilite_WVALID), .s_axilite_WREADY(s_axilite_WREADY), .s_axilite_WDATA(s_axilite_WDATA), .s_axilite_WSTRB(s_axilite_WSTRB), + .s_axilite_BVALID(s_axilite_BVALID), .s_axilite_BREADY(s_axilite_BREADY), .s_axilite_BRESP(s_axilite_BRESP), + + .s_axilite_ARVALID(s_axilite_ARVALID), .s_axilite_ARREADY(s_axilite_ARREADY), .s_axilite_ARADDR(s_axilite_ARADDR), + .s_axilite_RVALID(s_axilite_RVALID), .s_axilite_RREADY(s_axilite_RREADY), .s_axilite_RDATA(s_axilite_RDATA), .s_axilite_RRESP(s_axilite_RRESP), + .s_axis_tready(in0_V_TREADY), .s_axis_tvalid(in0_V_TVALID), .s_axis_tdata(in0_V_TDATA), + .m_axis_tready(out_V_TREADY), .m_axis_tvalid(out_V_TVALID), .m_axis_tdata(out_V_TDATA) + ); + +endmodule // $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/thresholding/sim/thresh_gen.sv b/finn-rtllib/thresholding/sim/thresh_gen.sv new file mode 100644 index 0000000000..ae30503f8f --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresh_gen.sv @@ -0,0 +1,75 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
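+ *
+ * @brief Helper module writing the per-PE, per-stage threshold
+ *        initialization files (threshs_<pe>_<stage>.dat) that
+ *        thresholding.sv loads via $readmemh when THRESHOLDS_PATH is set.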
+ *****************************************************************************/ +module thresh_gen; + localparam int unsigned K = 9; + localparam int unsigned N = 4; + localparam int unsigned C = 6; + + typedef logic [K-1:0] thresh_t; + localparam thresh_t THRESHOLDS[C][2**N-1] = '{ + '{ 'h00, 'h01, 'h02, 'h03, 'h04, 'h05, 'h06, 'h07, 'h08, 'h09, 'h0a, 'h0b, 'h0c, 'h0d, 'h0e }, + '{ 'h10, 'h11, 'h12, 'h13, 'h14, 'h15, 'h16, 'h17, 'h18, 'h19, 'h1a, 'h1b, 'h1c, 'h1d, 'h1e }, + '{ 'h20, 'h21, 'h22, 'h23, 'h24, 'h25, 'h26, 'h27, 'h28, 'h29, 'h2a, 'h2b, 'h2c, 'h2d, 'h2e }, + '{ 'h30, 'h31, 'h32, 'h33, 'h34, 'h35, 'h36, 'h37, 'h38, 'h39, 'h3a, 'h3b, 'h3c, 'h3d, 'h3e }, + '{ 'h40, 'h41, 'h42, 'h43, 'h44, 'h45, 'h46, 'h47, 'h48, 'h49, 'h4a, 'h4b, 'h4c, 'h4d, 'h4e }, + '{ 'h50, 'h51, 'h52, 'h53, 'h54, 'h55, 'h56, 'h57, 'h58, 'h59, 'h5a, 'h5b, 'h5c, 'h5d, 'h5e } + }; + localparam THRESHOLDS_PATH = "./"; + + localparam int unsigned PE = 2; + localparam int unsigned CF = C/PE; + + for(genvar stage = 0; stage < N; stage++) begin + localparam int unsigned SN = N-1-stage; + for(genvar pe = 0; pe < PE; pe++) begin + initial begin + automatic string file = $sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage); + + automatic thresh_t threshs[CF * 2**stage]; + for(int unsigned c = 0; c < CF; c++) begin + for(int unsigned i = 0; i < 2**stage; i++) begin + threshs[(c << stage) + i] = THRESHOLDS[c*PE + pe][(i<<(N-stage)) + 2**SN-1]; + end + end + + $writememh(file, threshs); + end + end + end + + // Quit after running all initializers + initial begin + #1ns; + $display("Generation done."); + $finish; + end + +endmodule : thresh_gen diff --git a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv new file mode 100644 index 0000000000..429fb7776f --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv @@ -0,0 +1,314 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Testbench for thresholding_axi.
+ * @author Monica Chiosa
+ *
+ */
+
+module thresholding_axi_tb #(
+    int unsigned N = 4,    // output precision
+    int unsigned C = 6,    // number of channels
+    int unsigned PE = 2,
+    real M0 = 7.3,    // slope of the uniform thresholding line
+    real B0 = 3.1,    // offset of the uniform thresholding line
+    bit THROTTLED = 1,
+
+    localparam int unsigned CF = C/PE,    // Channel Fold
+    localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2
+);
+
+    //-----------------------------------------------------------------------
+    // Design Geometry
+
+    // For each channel in [0,C):
+    //    M_channel = M0 + CX*channel
+    //    B_channel = B0 + CX*channel
+    // Input/threshold precision computed according to the maximum possible value
+    localparam real CX = 1.375;
+    localparam int unsigned K = $clog2((2**N-1)*(M0+C*CX) + (B0+C*CX)); // unused sign + magnitude
+    localparam int unsigned C_BITS = C < 2? 1 : $clog2(C);
+
+    localparam int unsigned MST_STRM_WROUNDS = 503;
+
+    typedef int unsigned threshs_t[C][2**N-1];
+    function threshs_t init_thresholds();
+        automatic threshs_t res;
+        for(int unsigned c = 0; c < C; c++) begin
+            automatic real m = M0 + c*CX;
+            automatic real b = B0 + c*CX;
+            foreach(res[c][i]) begin
+                res[c][i] = int'($ceil(m*i + b));
+            end
+        end
+        return res;
+    endfunction : init_thresholds
+    localparam threshs_t THRESHS = init_thresholds();
+
+    //-----------------------------------------------------------------------
+    // Clock and Reset Control
+    logic clk = 0;
+    always #5ns clk = !clk;
+    logic rst = 1;
+    initial begin
+        #10ns;
+        @(posedge clk);
+        rst <= 0;
+    end
+
+    //-----------------------------------------------------------------------
+    // DUT
+    logic s_axilite_AWVALID;
+    uwire s_axilite_AWREADY;
+    logic [ADDR_BITS-1:0] s_axilite_AWADDR; // lowest 2 bits (byte selectors) are ignored
+    logic s_axilite_WVALID;
+    uwire s_axilite_WREADY;
+    logic [31:0] s_axilite_WDATA;
+    uwire s_axilite_BVALID;
+    logic s_axilite_BREADY;
+    uwire [ 1:0] s_axilite_BRESP;
+    logic s_axilite_ARVALID;
+    uwire s_axilite_ARREADY;
+    logic [ADDR_BITS-1:0] s_axilite_ARADDR;
+    uwire s_axilite_RVALID;
+    uwire s_axilite_RREADY = 1;
+    uwire [31:0] s_axilite_RDATA;
+    uwire [ 1:0] s_axilite_RRESP;
+
+    uwire irdy;
+    logic ivld;
+    logic [PE-1:0][K-1:0] idat;
+
+    logic ordy = 0;
+    uwire ovld;
+    uwire [PE-1:0][N-1:0] odat;
+
+    thresholding_axi #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(0), .USE_AXILITE(1)) dut (
+        .ap_clk(clk), .ap_rst_n(!rst),
+
+        // Configuration
+        .s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR,
+        .s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB('1),
+        .s_axilite_BVALID, .s_axilite_BREADY, .s_axilite_BRESP,
+        .s_axilite_ARVALID, .s_axilite_ARREADY, .s_axilite_ARADDR,
+        .s_axilite_RVALID, .s_axilite_RREADY, .s_axilite_RDATA, .s_axilite_RRESP,
+
+        // Stream Processing
+        .s_axis_tready(irdy), .s_axis_tvalid(ivld), .s_axis_tdata(idat),
+        .m_axis_tready(ordy), .m_axis_tvalid(ovld), .m_axis_tdata(odat)
+    );
+
+    //-----------------------------------------------------------------------
+    // Input Stimuli
+    typedef logic [PE-1:0][K-1:0] input_t;
+    typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t;
+    input_t QW[$]; // Input Feed Tracing
+    addr_t QC[$];  // Readback Tracing
+
+    int unsigned error_cnt = 0;
+    bit done = 0;
+    initial begin
+        // Report testbench details
+        $display("Testbench - thresholding K=%0d -> N=%0d", K, N);
+        for(int unsigned c = 0; c < C; c++) begin
+            $write("Channel #%0d: Thresholds = {", c);
+            for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0d", THRESHS[c][i]);
+            $display(" }");
+        end
+
+        // Config
+        s_axilite_AWVALID = 0;
+        s_axilite_AWADDR = 'x;
+        s_axilite_WVALID = 0;
+        s_axilite_WDATA = 'x;
+        s_axilite_BREADY = 0;
+        s_axilite_ARVALID = 0;
+        s_axilite_ARADDR = 'x;
+
+        // Stream Input
+        ivld = 0;
+        idat = 'x;
+
+        @(posedge clk iff !rst);
+
+        // Threshold Configuration
+        for(int unsigned c = 0; c < C; c+=PE) begin
+            automatic addr_t addr = 0;
+            if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = c/PE;
+            for(int unsigned pe = 0; pe < PE; pe++) begin
+                if(PE > 1) addr[N+:$clog2(PE)] = pe;
+                for(int unsigned t = 0; t < 2**N-1; t++) begin
+                    addr[0+:N] = t;
+                    // Drive the AW, W, and B channels concurrently
+                    fork
+                        begin
+                            s_axilite_AWVALID <= 1;
+                            s_axilite_AWADDR <= { addr, 2'b00 };
+                            @(posedge clk iff s_axilite_AWREADY);
+                            s_axilite_AWVALID <= 0;
+                            s_axilite_AWADDR <= 'x;
+                        end
+                        begin
+                            s_axilite_WVALID <= 1;
+                            s_axilite_WDATA <= THRESHS[c+pe][t];
+                            @(posedge clk iff s_axilite_WREADY);
+                            s_axilite_WVALID <= 0;
+                            s_axilite_WDATA <= 'x;
+                        end
+                        begin
+                            s_axilite_BREADY <= 1;
+                            @(posedge clk iff s_axilite_BVALID);
+                            assert(s_axilite_BRESP == '0) else begin
+                                $error("Error on parameter write.");
+                                $stop;
+                            end
+                            s_axilite_BREADY <= 0;
+                        end
+                    join
+                end
+            end
+        end
+
+        fork
+            // Intermittent configuration readback
+            while(!done) begin
+                if(($urandom()%37) != 0) begin
+                    s_axilite_ARVALID <= 0;
+                    s_axilite_ARADDR <= 'x;
+                    @(posedge clk);
+                end
+                else begin
+                    automatic addr_t addr = $urandom()%(N-1);
+                    if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE;
+                    if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF;
+
+                    s_axilite_ARVALID <= 1;
+                    s_axilite_ARADDR <= { addr, 2'b00 };
+                    @(posedge clk iff s_axilite_ARREADY);
+
+                    QC.push_back(addr);
+                end
+            end
+
+            // AXI4-Stream master writes input values
+            repeat(MST_STRM_WROUNDS) begin
+                automatic input_t dat;
+
+                while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk);
+
+                std::randomize(dat);
+                ivld <= 1;
+                idat <= dat;
+                @(posedge clk iff irdy);
+                ivld <= 0;
+                idat <= 'x;
+                QW.push_back(dat);
+            end
+        join_any
+        done <= 1;
+        repeat(N+6) @(posedge clk);
+
+        assert(QW.size() == 0) else begin
+            $error("Missing %0d outputs.", QW.size());
+            $stop;
+        end
+        assert(QC.size() == 0) else begin
+            $error("Missing %0d readback replies.", QC.size());
+            $stop;
+        end
+
+        $display("Test completed: %0d errors in %0d tests.", error_cnt, MST_STRM_WROUNDS);
+        $display("=========================================");
+        $finish;
+    end
+
+    // Output Checker -------------------------------------------------------
+
+    // Configuration Readback
+    always_ff @(posedge clk iff s_axilite_RVALID) begin
+        assert(s_axilite_RRESP == '0) else begin
+            $error("Read back error.");
+            $stop;
+        end
+        assert(QC.size()) begin
+            automatic addr_t addr = QC.pop_front();
+            automatic int unsigned cnl =
+                (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) +
+                (PE == 1? 0 : addr[N+:$clog2(PE)]);
+            automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]];
+            assert(s_axilite_RDATA == exp) else begin
+                $error("Readback mismatch on #%0d.%0d: %0d instead of %0d", cnl, addr[0+:N], s_axilite_RDATA, exp);
+                $stop;
+            end
+        end
+        else begin
+            $error("Spurious readback output.");
+            $stop;
+        end
+    end
+
+    // Stream Output
+    int unsigned OCnl = 0;
+    always @(posedge clk) begin
+        if(rst) begin
+            OCnl <= 0;
+            ordy <= 1'b0;
+        end
+        else begin
+            if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED;
+
+            if(ordy && ovld) begin
+                assert(QW.size()) begin
+                    automatic input_t x = QW.pop_front();
+
+                    for(int unsigned pe = 0; pe < PE; pe++) begin
+                        automatic int unsigned cnl = OCnl + pe;
+
+                        $display("Mapped CNL=%0d DAT=%3d -> #%2d", cnl, x[pe], odat[pe]);
+                        assert(
+                            ((odat[pe] == 0) || (THRESHS[cnl][odat[pe]-1] <= x[pe])) &&
+                            ((odat[pe] == 2**N-1) || (x[pe] < THRESHS[cnl][odat[pe]]))
+                        ) else begin
+                            $error("Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", cnl, x[pe], odat[pe]);
+                            error_cnt++;
+                            $stop;
+                        end
+                    end
+                end
+                else begin
+                    $error("Spurious output.");
+                    $stop;
+                end
+
+                OCnl <= (OCnl + PE)%C;
+            end
+        end
+    end
+
+endmodule: thresholding_axi_tb
diff --git a/finn-rtllib/thresholding/sim/thresholding_tb.sv b/finn-rtllib/thresholding/sim/thresholding_tb.sv
new file mode 100644
index 0000000000..1564f28f0d
--- /dev/null
+++ b/finn-rtllib/thresholding/sim/thresholding_tb.sv
@@ -0,0 +1,274 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Testbench for the thresholding module.
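+ *        Exercises the bare thresholding core, without the AXI adapter, in
+ *        three parallel instances covering unsigned, signed, and signed
+ *        floating-point input interpretations against randomized thresholds.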
+ * @author Monica Chiosa + * + */ + +module thresholding_tb #( + int unsigned K = 10, // input precision + int unsigned N = 4, // output precision + int unsigned C = 6, // number of channels + int unsigned PE = 2, + + localparam int unsigned CF = C/PE // Channel Fold +); + localparam bit DEEP_PIPELINE = 1; + + localparam int unsigned MST_STRM_WROUNDS = 507; + localparam bit THROTTLED = 1; + + //----------------------------------------------------------------------- + // Clock and Reset Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + #10ns; + @(posedge clk); + rst <= 0; + end + + //----------------------------------------------------------------------- + // Parallel Instances differing in Data Type + typedef logic [K -1:0] val_t; + typedef val_t threshs_t[C][2**N-1]; + typedef val_t [PE-1:0] input_t; + typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t; + logic [0:2] term = '0; + always_comb begin + if(&term) $finish; + end + for(genvar i = 0; i < 3; i++) begin : genTypes + localparam bit SIGNED = i>0; + localparam bit FPARG = i>1; + + //- DUT ------------------------- + logic cfg_en; + logic cfg_we; + logic [$clog2(C)+N-1:0] cfg_a; + logic [K-1:0] cfg_d; + uwire cfg_rack; + uwire [K-1:0] cfg_q; + + uwire irdy; + logic ivld; + logic [PE-1:0][K-1:0] idat; + + logic ordy = 0; + uwire ovld; + uwire [PE-1:0][N-1:0] odat; + + thresholding #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(SIGNED), .FPARG(FPARG), .USE_CONFIG(1), .DEEP_PIPELINE(DEEP_PIPELINE)) dut ( + .clk, .rst, + + // Configuration + .cfg_en, .cfg_we, .cfg_a, .cfg_d, + .cfg_rack, .cfg_q, + + // Stream Processing + .irdy, .ivld, .idat, + .ordy, .ovld, .odat + ); + + //- Stimulus Driver ------------- + threshs_t THRESHS; + function val_t sigord(input val_t x); + automatic val_t res = x; + if(SIGNED) begin + if(FPARG && x[K-1]) res[K-2:0] = ~x[K-2:0]; + res[K-1] = !x[K-1]; + end + return res; + endfunction : sigord + + input_t QW[$]; // Input tracing + addr_t QC[$]; // Readback tracking + int unsigned error_cnt = 0; + bit done = 0; + initial begin + + // Generate thresholds + std::randomize(THRESHS); + foreach(THRESHS[c]) begin + val_t row[2**N-1] = THRESHS[c]; + row.sort with (sigord(item)); + THRESHS[c] = row; + end + + // Report test case details + $display("[%0d] Thresholding %s%s%0d -> uint%0d", i, SIGNED? "s" : "u", FPARG? 
"fp" : "int", K, N); + for(int unsigned c = 0; c < C; c++) begin + $write("[%0d] Channel #%0d: Thresholds = {", i, c); + for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0X", THRESHS[c][i]); + $display(" }"); + end + + // Config + cfg_en = 0; + cfg_we = 'x; + cfg_a = 'x; + cfg_d = 'x; + + // Stream Input + ivld = 0; + idat = 'x; + + @(posedge clk iff !rst); + + // Threshold Configuratin + cfg_en <= 1; + cfg_we <= 1; + for(int unsigned c = 0; c < C; c+=PE) begin + if(CF > 1) cfg_a[N+$clog2(PE)+:$clog2(CF)] <= c/PE; + for(int unsigned pe = 0; pe < PE; pe++) begin + if(PE > 1) cfg_a[N+:$clog2(PE)] = pe; + for(int unsigned t = 0; t < 2**N-1; t++) begin + cfg_a[0+:N] <= t; + cfg_d <= THRESHS[c+pe][t]; + @(posedge clk); + end + end + end + cfg_d <= 'x; + + fork + // Intermittent configuration readback + while(!done) begin + cfg_en <= 0; + cfg_we <= 'x; + cfg_a <= 'x; + @(posedge clk); + if(($urandom()%41) == 0) begin + automatic addr_t addr = $urandom()%(N-1); + if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE; + if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF; + + cfg_en <= 1; + cfg_we <= 0; + cfg_a <= addr; + @(posedge clk); + QC.push_back(addr); + end + end + + // AXI4Stream MST Writes input values + repeat(MST_STRM_WROUNDS) begin + automatic input_t dat; + + while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk); + + std::randomize(dat); + ivld <= 1; + idat <= dat; + @(posedge clk iff irdy); + ivld <= 0; + idat <= 'x; + QW.push_back(dat); + end + join_any + done <= 1; + repeat((DEEP_PIPELINE+1)*N+8) @(posedge clk); + + assert(QW.size() == 0) else begin + $error("[%0d] Missing %0d outputs.", i, QW.size()); + $stop; + end + assert(QC.size() == 0) else begin + $error("[%0d] Missing %0d readback replies.", i, QC.size()); + $stop; + end + + $display("[%0d] Test completed: %0d errors in %0d tests.", i, error_cnt, MST_STRM_WROUNDS); + $display("============================================="); + term[i] <= 1; + end + + //- Readback Checker -------------- + always_ff @(posedge clk iff cfg_rack) begin + assert(QC.size()) begin + automatic addr_t addr = QC.pop_front(); + automatic int unsigned cnl = + (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) + + (PE == 1? 
0 : addr[N+:$clog2(PE)]);
+                automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]];
+                assert(cfg_q == exp) else begin
+                    $error("[%0d] Readback mismatch on #%0d.%0d: %0d instead of %0d", i, cnl, addr[0+:N], cfg_q, exp);
+                    $stop;
+                end
+            end
+            else begin
+                $error("[%0d] Spurious readback output.", i);
+                $stop;
+            end
+        end
+
+        // Output Checker
+        int unsigned OCnl = 0;
+        always @(posedge clk) begin
+            if(rst) begin
+                OCnl <= 0;
+                ordy <= 1'b0;
+            end
+            else begin
+                if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED;
+
+                if(ordy && ovld) begin
+                    assert(QW.size()) begin
+                        automatic input_t x = QW.pop_front();
+
+                        for(int unsigned pe = 0; pe < PE; pe++) begin
+                            automatic int unsigned cnl = OCnl + pe;
+
+                            $display("[%0d] Mapped CNL=%0d DAT=%3x -> #%2d", i, cnl, x[pe], odat[pe]);
+                            assert(
+                                ((odat[pe] == 0) || (sigord(THRESHS[cnl][odat[pe]-1]) <= sigord(x[pe]))) &&
+                                ((odat[pe] == 2**N-1) || (sigord(x[pe]) < sigord(THRESHS[cnl][odat[pe]])))
+                            ) else begin
+                                $error("[%0d] Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", i, cnl, x[pe], odat[pe]);
+                                error_cnt++;
+                                $stop;
+                            end
+                        end
+                    end
+                    else begin
+                        $error("[%0d] Spurious output.", i);
+                        $stop;
+                    end
+
+                    OCnl <= (OCnl + PE)%C;
+                end
+            end
+        end
+
+    end : genTypes
+
+endmodule: thresholding_tb
diff --git a/notebooks/advanced/0_custom_analysis_pass.ipynb b/notebooks/advanced/0_custom_analysis_pass.ipynb
index f915b11fa0..5ed48ca6d8 100644
--- a/notebooks/advanced/0_custom_analysis_pass.ipynb
+++ b/notebooks/advanced/0_custom_analysis_pass.ipynb
@@ -153,9 +153,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/notebooks/advanced/1_custom_transformation_pass.ipynb b/notebooks/advanced/1_custom_transformation_pass.ipynb
index 7e4989c902..91dd925b25 100644
--- a/notebooks/advanced/1_custom_transformation_pass.ipynb
+++ b/notebooks/advanced/1_custom_transformation_pass.ipynb
@@ -212,7 +212,7 @@
    "\n",
    "To control the degree of parallelization the argument `num_workers` can be specified. When the Docker container is started, the env variable `NUM_DEFAULT_WORKERS` is set to 4 by default, this can be increased or decreased depending on the system. You can also set the number of workers manually to a specific value when calling a transformation that allows parallelization. If the value is set to 0, all available CPU cores are used.\n",
    "\n",
-    "In the following we want to take a closer look at the implementation using the compile transformation as example."
+    "In the following we want to take a closer look at the implementation using the compile transformation that is used for C++ simulation as an example."
   ]
  },
 {
@@ -230,7 +230,7 @@
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-   "The class is derived from the NodeLocalTransformation class and performs the compilation at every node that is fpgadataflow node."
+   "The class is derived from the NodeLocalTransformation class and performs the compilation at every node that is an HLS node."
  ]
  }
 ],
@@ -250,9 +250,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb
index 636da64dd5..bdd2976412 100644
--- a/notebooks/advanced/2_custom_op.ipynb
+++ b/notebooks/advanced/2_custom_op.ipynb
@@ -672,7 +672,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/advanced/3_folding.ipynb b/notebooks/advanced/3_folding.ipynb
index 07b66da52f..8c7b97d6c6 100644
--- a/notebooks/advanced/3_folding.ipynb
+++ b/notebooks/advanced/3_folding.ipynb
@@ -8,7 +8,7 @@
    "--------------------------------------\n",
    "**Note: We will utilize one of the intermediate models generated in the process of the cybersecurity end2end example**\n",
    "\n",
-    "There is a local copy of `step_convert_to_hls.onnx` in this directory, which was renamed to `cybsec_PE_SIMD.onnx` to be able to go through this tutorial without requisites. But you can also generate it yourself with the [third cybersecurity Jupyter notebook](../end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb). After the execution of the estimates only build flow, it can be found in `../end2end_example/cybersecurity/output_estimates_only/intermediate_models/step_convert_to_hls.onnx`. \n",
+    "There is a local copy of `step_specialize_layers.onnx` in this directory, which was renamed to `cybsec_PE_SIMD.onnx` to be able to go through this tutorial without prerequisites. But you can also generate it yourself with the [third cybersecurity Jupyter notebook](../end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb). After the execution of the estimates only build flow, it can be found in `../end2end_example/cybersecurity/output_estimates_only/intermediate_models/step_specialize_layers.onnx`. \n",
    "\n",
    "This notebook describes the use of FINN parallelization parameters (PE & SIMD), also called folding factors, to efficiently optimize models so as to extract the maximum performance out of them. \n",
    "\n",
@@ -41,7 +41,7 @@
  "source": [
    "This notebook shows the manual version of this step and explains how these attributes can improve performance and what are their effects on resource utilization for developers who need to maximize the performance of their network. \n",
    "\n",
-    "For that we will use the `cybsec_PE_SIMD.onnx` file as starting point. This intermediate model from the cybersecurity example is the model representation after the high-level ONNX layers are converted to HLS layers. Each node in the graph now corresponds to an HLS C++ function call and the parallelization parameters can be set using the node attributes.\n",
+    "For that we will use the `cybsec_PE_SIMD.onnx` file as starting point. This intermediate model from the cybersecurity example is the model representation after the high-level ONNX layers are converted to HW layers and then specialized to either HLS or RTL variants. In this example, all nodes were converted to HLS variants. This means that each node in the graph now corresponds to an HLS C++ function call and the parallelization parameters can be set using the node attributes.\n",
    "\n",
    "We will take this model to show how to set the folding factors manually and analyze the estimated execution clock cycles and the resource utilization of each layer in the network."
  ]
@@ -56,7 +56,7 @@
    "\n",
    "![](finn-dataflow.png)\n",
    "\n",
-    "In practice, the layers are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library.\n",
+    "In practice, the layers are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library or by RTL modules from the [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib).\n",
    "\n",
    "Since each layer will be instantiated, we can flexibly set the parallelization of each layer and thus control resources and throughput of our network, as visualized in the image below:\n",
    "\n",
@@ -72,11 +72,11 @@
    "As discussed above, the network needs to go through a few preparation steps before it can be fed into our estimation functions.\n",
    "\n",
    "The `.onnx` file loaded here is taken from the cybersecurity end2end example notebook. \n",
-    "We pick the onnx file `cybsec_PE_SIMD.onnx` to which the necessary transformations have been applied for this notebook. This means, network layers mapped to necessary FINN-HLS blocks. In this case, the `MatrixVectorActivation` units. \n",
+    "We pick the onnx file `cybsec_PE_SIMD.onnx` to which the necessary transformations have been applied for this notebook. This means the network layers are mapped to the necessary FINN-HW blocks, in this case the HLS variant of the MatrixVectorActivation, the `MVAU_hls` units. \n",
    "\n",
    "To interact with the `.onnx` file we use `ModelWrapper()`. This wrapper simplifies the access to different model attributes and allows us to apply custom transformations on the model.\n",
    "\n",
-    "In the below cell, we load our onnx file and view the cybersecurity MLP network in Netron."
+    "In the cell below, we load our onnx file and view the cybersecurity MLP network in Netron. Additionally, we call the transformation `GiveUniqueNodeNames` as a preparation step."
   ]
  },
 {
@@ -87,8 +87,12 @@
  "source": [
    "import os\n",
    "from qonnx.core.modelwrapper import ModelWrapper\n",
-    "model_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD.onnx\" \n",
-    "model = ModelWrapper(model_path)\n",
+    "from qonnx.transformation.general import GiveUniqueNodeNames\n",
+    "\n",
+    "model = ModelWrapper(os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD.onnx\")\n",
+    "model = model.transform(GiveUniqueNodeNames())\n",
+    "model_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD_named_nodes.onnx\"\n",
+    "model.save(model_path)\n",
    "\n",
    "showInNetron(model_path)"
   ]
@@ -106,7 +110,7 @@
  "source": [
    "The computational parallelism can be varied by setting the folding factors or also called parallelization parameters **PE** and **SIMD** of each layer. These parallelization attributes are subject to certain constraints and should be selected accordingly.\n",
    "\n",
-    "To see more details about how this is implemented in the `MatrixVectorActivation` layer (MVAU), please have a look at [this documentation](https://github.com/Xilinx/finn/blob/github-pages/docs/finn-sheduling-and-folding.pptx). A schematic of the folding in an MVAU for a fully-connected layer is shown below:\n",
+    "To see more details about how this is implemented in the HLS variant of the MatrixVectorActivation layer (`MVAU_hls`), please have a look at [this documentation](https://github.com/Xilinx/finn/blob/github-pages/docs/finn-sheduling-and-folding.pptx). A schematic of the folding in an MVAU for a fully-connected layer is shown below:\n",
    "\n",
    "![](finn-folding-mvau.png)"
   ]
  },
@@ -220,7 +224,7 @@
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "Next to the absolute numbers of LUTs, BRAM, URAM and DSPs, the analysis pass also provides information about the efficiency of the memory usage. If the memory type is not utilized, the efficiency is by default 1. You can see that above for the `URAM_efficiency`. In all other cases the efficiency indicates the actual parameter storage needed divided by the allocated BRAM/URAM storage. So, this means in our example MVAU_0 uses 5 block ram and they are 83% utilized. "
+    "Next to the absolute numbers of LUTs, BRAM, URAM and DSPs, the analysis pass also provides information about the efficiency of the memory usage. If the memory type is not utilized, the efficiency is by default 1. You can see that above for the `URAM_efficiency`. In all other cases the efficiency indicates the actual parameter storage needed divided by the allocated BRAM/URAM storage. So, this means in our example MVAU_hls_0 uses 5 block RAMs and they are 83% utilized. "
   ]
  },
@@ -262,7 +266,7 @@
    "## Modify Parameters\n",
    "\n",
    "We now modify the parallelization parameters of the first network layer to reduce its latency.\n",
-    "We only extract the first `MatrixVectorActivation` block from the model and set the parallelization parameters manually.\n",
+    "We only extract the first `MVAU_hls` block from the model and set the parallelization parameters manually.\n",
    "\n",
    "In the first step, we left the `PE` & `SIMD` values for all the layers on default (=1) to establish a baseline and measure the estimated clock cycles and resource utilization for each of the individual layers.\n",
    "\n",
@@ -277,7 +281,7 @@
  "source": [
    "from qonnx.custom_op.registry import getCustomOp\n",
    "\n",
-    "list_of_mvaus = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+    "list_of_mvaus = model.get_nodes_by_op_type(\"MVAU_hls\")\n",
    "mvau0 = list_of_mvaus[0]\n",
    "\n",
    "mvau0_inst = getCustomOp(mvau0)\n",
@@ -301,7 +305,7 @@
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "We save the model and view it. On expanding the first `MatrixVectorActivation` we can see the updated `PE` & `SIMD` parameters for that layer."
+    "We save the model and view it. On expanding the first `MVAU_hls`, we can see the updated `PE` & `SIMD` parameters for that layer."
   ]
  },
 {
@@ -418,7 +422,7 @@
  "outputs": [],
  "source": [
    "dir_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/\" \n",
-    "model_orig = ModelWrapper(dir_path + \"cybsec_PE_SIMD.onnx\")\n",
+    "model_orig = ModelWrapper(dir_path + \"cybsec_PE_SIMD_named_nodes.onnx\")\n",
    "model_updated = ModelWrapper(\"cybsec_PE_SIMD_modified.onnx\")"
   ]
  },
@@ -436,7 +440,7 @@
  "outputs": [],
  "source": [
    "# Original model\n",
-    "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+    "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MVAU_hls\")\n",
    "print(\"In the original model (pe=simd=1): \")\n",
    "for mvau in list_of_mvaus:\n",
    "    mvau_inst = getCustomOp(mvau)\n",
@@ -452,7 +456,7 @@
  "outputs": [],
  "source": [
    "# Updated model\n",
-    "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+    "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MVAU_hls\")\n",
    "print(\"In the original model (pe=simd=1): \")\n",
    "for mvau in list_of_mvaus:\n",
    "    mvau_inst = getCustomOp(mvau)\n",
@@ -465,7 +469,7 @@
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "We can see that the input and output shape for MatrixVectorActivation_0 has changed after we have changed the folding factors. These changes have direct influence on the in/out stream width. We can have a closer look at the formula to calculate the stream width of an MVAU."
+    "We can see that the input and output shape for MVAU_hls_0 has changed after we have changed the folding factors. These changes have direct influence on the in/out stream width. We can have a closer look at the formula to calculate the stream width of an MVAU."
   ]
  },
 {
@@ -507,7 +511,7 @@
  "outputs": [],
  "source": [
    "# Original model\n",
-    "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+    "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MVAU_hls\")\n",
    "print(\"In the original model (pe=simd=1): \")\n",
    "for mvau in list_of_mvaus:\n",
    "    mvau_inst = getCustomOp(mvau)\n",
@@ -537,7 +541,7 @@
  "outputs": [],
  "source": [
    "# Updated model\n",
-    "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+    "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MVAU_hls\")\n",
    "print(\"In the original model (pe=simd=1): \")\n",
    "for mvau in list_of_mvaus:\n",
    "    mvau_inst = getCustomOp(mvau)\n",
@@ -550,7 +554,7 @@
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "As we can see, the output stream width of MatrixVectorActivation_0 has now changed to `4`, while the input stream width of MatrixVectorActivation_1 stayed `2`. So, the FINN compiler would insert a DWC between these nodes, we can manually invoke this behavior by calling the transformation `InsertDWC` on our model."
+    "As we can see, the output stream width of MVAU_hls_0 has now changed to `4`, while the input stream width of MVAU_hls_1 stayed `2`. So, the FINN compiler would insert a DWC between these nodes; we can manually invoke this behavior by first calling the transformation `InsertDWC` and then converting the resulting DWCs into an HLS or RTL variant by calling `SpecializeLayers`."
] }, { @@ -560,9 +564,10 @@ "outputs": [], "source": [ "from finn.transformation.fpgadataflow.insert_dwc import InsertDWC\n", - "from qonnx.transformation.general import GiveUniqueNodeNames\n", + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", "\n", "model_updated = model_updated.transform(InsertDWC())\n", + "model_updated = model_updated.transform(SpecializeLayers())\n", "model_updated = model_updated.transform(GiveUniqueNodeNames())" ] }, @@ -610,7 +615,6 @@ "source": [ "layers = res_dict_dwc.keys()\n", "# replace names of layers with abbreviations\n", - "layers = [n.replace(\"MatrixVectorActivation_\", \"MVU\") for n in layers]\n", "layers = [n.replace(\"StreamingDataWidthConverter_Batch\", \"DWC\") for n in layers]" ] }, @@ -656,9 +660,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb index e748d85a1c..dccac6195d 100644 --- a/notebooks/advanced/4_advanced_builder_settings.ipynb +++ b/notebooks/advanced/4_advanced_builder_settings.ipynb @@ -9,7 +9,7 @@ "\n", "\"drawing\"\n", "\n", - "In this notebook, we'll use the FINN compiler to generate an FPGA accelerator with a streaming dataflow architecture from a small convolutional network trained on CIFAR-10. The key idea in streaming dataflow architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, illustrated on the figure to the left. You can read more about the general concept in the [FINN](https://arxiv.org/pdf/1612.07119) and [FINN-R](https://dl.acm.org/doi/pdf/10.1145/3242897) papers. This is done by mapping each layer to a Vitis HLS description, parallelizing each layer's implementation to the appropriate degree and using on-chip FIFOs to link up the layers to create the full accelerator.\n", + "In this notebook, we'll use the FINN compiler to generate an FPGA accelerator with a streaming dataflow architecture from a small convolutional network trained on CIFAR-10. The key idea in streaming dataflow architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, illustrated on the figure to the left. You can read more about the general concept in the [FINN](https://arxiv.org/pdf/1612.07119) and [FINN-R](https://dl.acm.org/doi/pdf/10.1145/3242897) papers. This is done by mapping each layer to a Vitis HLS or RTL description, parallelizing each layer's implementation to the appropriate degree and using on-chip FIFOs to link up the layers to create the full accelerator.\n", "These implementations offer a good balance of performance and flexibility, but building them by hand is difficult and time-consuming. This is where the FINN compiler comes in: it can build streaming dataflow accelerators from an ONNX description to match the desired throughput." ] }, @@ -32,9 +32,10 @@ "1. [Introduction to the CNV-w2a2 network](#intro_cnv)\n", "2. [Recap default builder flow](#recap_builder)\n", "3. [Build steps](#build_step)\n", - " 1. [How to make a custom build step](#custom_step)\n", - "4. [Folding configuration json](#folding_config)\n", - "5. [Additional builder arguments](#builder_arg)\n", + " 1. [How to create a custom build step](#custom_step)\n", + "4. 
[Specialize layers configuration json](#specialize_layers)\n", + "5. [Folding configuration json](#folding_config)\n", + "6. [Additional builder arguments](#builder_arg)\n", " 1. [Verification steps](#verify)\n", " 2. [Other builder arguments](#other_args)\n", " 3. [Examples for additional builder arguments & bitfile generation](#example_args)" @@ -198,7 +199,7 @@ "id": "d746eff3", "metadata": {}, "source": [ - "After each FINN builder step, the graph is saved as .onnx file. In the cell above we sort the intermediate models by time in descending order (`ls -t -r`) to visualize the builder flow. As you can see after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy up and streamlining (`step_tidy_up` and `step_streamline`) and then the high level nodes are converted to HLS layers (`step_convert_to_hls`). Then there is a partition created from all layers that were converted to HLS layers (`step_create_dataflow_partition`), then optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step." + "After each FINN builder step, the graph is saved as a .onnx file. In the cell above we sort the intermediate models by time in descending order (`ls -t -r`) to visualize the builder flow. As you can see, after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy-up and streamlining (`step_tidy_up` and `step_streamline`) and then the high-level nodes are converted to HW abstraction layers (`step_convert_to_hw`). Then a partition is created from all layers that were converted to HW layers (`step_create_dataflow_partition`), and each of the HW abstraction layers is converted into an HLS or RTL variant (`step_specialize_layers`). Afterwards optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step." ] }, { @@ -217,7 +218,7 @@ "id": "bccebd0d", "metadata": {}, "source": [ - "The analysis of these .onnx files can help us identifying points in the flow in which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HLS layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HLS layers have the module `finn.custom_op.fpgadataflow`." + "The analysis of these .onnx files can help us identify points in the flow in which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HW layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HW layers have the module `finn.custom_op.fpgadataflow`."
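If you prefer doing this from Python instead of the `ls` call, a small sketch like the following lists the intermediate snapshots in the order the builder wrote them (mirroring `ls -t -r`); the output directory name is the one used for this build earlier in the notebook.

```python
import glob
import os

# List the intermediate .onnx snapshots in build order.
inter_dir = build_dir + "/output_estimates_only/intermediate_models"
for onnx_file in sorted(glob.glob(inter_dir + "/*.onnx"), key=os.path.getmtime):
    print(os.path.basename(onnx_file))
```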
] }, { @@ -227,7 +228,7 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/step_convert_to_hls.onnx\")" + "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/step_convert_to_hw.onnx\")" ] }, { @@ -235,7 +236,7 @@ "id": "2719cc09", "metadata": {}, "source": [ - "As you can see in the graph, the first two nodes (a MultiThreshold and Transpose node) and the last two nodes (a Mul and Add node) are not converted into HLS layers. FINN currently only converts integer only operations into HLS layers, this means only when the input, output & weights are quantized to integer the node will be converted." + "As you can see in the graph, the first two nodes (a MultiThreshold and Transpose node) and the last two nodes (a Mul and Add node) are not converted into HW layers. FINN currently only converts integer-only operations into HW layers, i.e. a node will only be converted when its input, output & weights are quantized to integers." ] }, { @@ -253,7 +254,7 @@ "id": "6e6d942e", "metadata": {}, "source": [ - "When we click on the `global_in` in the graph, we can see that the quantization annotation does not contain a data type. If no data type is set and it can not be derived from the preceeding node, the FINN compiler automatically assumes that the data type is floating point. This is why the first node does not get converted into an HLS layer, the input is assumed to be floating point." + "When we click on the `global_in` in the graph, we can see that the quantization annotation does not contain a data type. If no data type is set and it cannot be derived from the preceding node, the FINN compiler automatically assumes that the data type is floating point. This is why the first node does not get converted into an HW layer: the input is assumed to be floating point." ] }, { @@ -274,7 +275,7 @@ "Even though in the example of the CNVw2a2, the inputs are 32x32 RGB images, so the input values are 8 bit (UINT8) \"quantized\", the input to the exported model is floating point. For training in Brevitas, these values were normalized between 0 and 1.0 and so the exported model expects floating point values as input. \n", "This means we are in scenario 2. In the next section we will develop a custom step for the FINN builder flow to add preprocessing to our network.\n", "\n", - "But before we move to the next section, let's take a look at the last two nodes in the graph that were not converted to HLS layers." + "But before we move to the next section, let's take a look at the last two nodes in the graph that were not converted to HW layers."
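As a side note, annotating the input with an integer datatype is possible directly on the model; the sketch below uses the QONNX `DataType` registry and assumes the intermediate model has been loaded into a `ModelWrapper` named `model` (in this tutorial we will instead handle this with a custom preprocessing step in the next section).

```python
from qonnx.core.datatype import DataType

# Mark the global input as 8-bit unsigned so the first node can be
# treated as an integer-only operation.
global_inp_name = model.graph.input[0].name
model.set_tensor_datatype(global_inp_name, DataType["UINT8"])
```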
] }, { @@ -368,7 +369,7 @@ "id": "e9c2c97f", "metadata": {}, "source": [ - "### How to make a custom build step " + "### How to create a custom build step " ] }, { @@ -439,8 +440,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_target_fps_parallelization\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", @@ -548,8 +550,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_target_fps_parallelization\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", @@ -614,7 +617,7 @@ "id": "5cc97505", "metadata": {}, "source": [ - "Let's have a look at the model after the conversion to hls, to verify that now all layers are correctly converted." + "Let's have a look at the model after the conversion to HW layers, to verify that all layers are now correctly converted." ] }, { @@ -624,7 +627,7 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron(build_dir+\"/output_pre_and_post_proc/intermediate_models/step_convert_to_hls.onnx\")" + "showInNetron(build_dir+\"/output_pre_and_post_proc/intermediate_models/step_convert_to_hw.onnx\")" ] }, { @@ -632,7 +635,298 @@ "id": "8fd0af6b", "metadata": {}, "source": [ - "The model contains now a `Thresholding` layer in the beginning and a `LabelSelect_Batch` layer at the end. Please note, that there is still a `Transpose` node as the first layer of the graph, but we can solve this by converting the input data to the NHWC format before streaming it into the FINN accelerator." + "The model now contains a `Thresholding` layer at the beginning and a `LabelSelect` layer at the end. Please note that there is still a `Transpose` node as the first layer of the graph, but we can solve this by converting the input data to the NHWC format before streaming it into the FINN accelerator." + ] + }, + { + "cell_type": "markdown", + "id": "a6edf5c4-9213-45cd-834f-615c12685d9e", + "metadata": {}, + "source": [ + "## Specialize layers configuration json " + ] + }, + { + "cell_type": "markdown", + "id": "4ae83d6e-c704-4c7f-a922-a4b470c0a55f", + "metadata": {}, + "source": [ + "The FINN compiler was originally developed with the assumption that the hardware blocks corresponding to the neural network layers are implemented in HLS. While we do not want to abandon the HLS implementations at this time, it has become apparent over the years that it makes sense to implement certain modules in RTL. This gives us greater control over the resulting hardware and allows us to make optimal use of FPGA resources.\n" + ] + }, + { + "cell_type": "markdown", + "id": "ed72aabf-0517-422f-a686-6c70e7492114", + "metadata": {}, + "source": [ + "So, as more and more RTL variants of common FINN hardware building blocks became available, we introduced an additional builder step called `step_specialize_layers`. In this step, HW nodes get specialized to either an HLS or an RTL variant of the node. " + ] + }, + { + "cell_type": "markdown", + "id": "82a2bc39-8a37-49aa-a79d-2818e66ebd11", + "metadata": {}, + "source": [ + "The nodes get converted either based on pre-determined rules or according to a user-provided configuration file that contains the desired settings. 
If the user preference cannot be fulfilled, a warning will be printed and the implementation style will be set to a default. " + ] + }, + { + "cell_type": "markdown", + "id": "bc90b589-7a92-4996-9704-02736ac4e60e", + "metadata": {}, + "source": [ + "The builder flow step before `step_specialize_layers` generates a template json file to set the preferred implementation style per layer. We can copy it from one of the previous runs to this folder, manipulate it and pass it to a new build." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddb88eb1-3f11-4343-ae7c-3e5e8cbc34dc", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "with open(build_dir+\"/output_pre_and_post_proc/template_specialize_layers_config.json\", 'r') as json_file:\n", + " specialize_layers_config = json.load(json_file)\n", + "\n", + "print(json.dumps(specialize_layers_config, indent=1))" + ] + }, + { + "cell_type": "markdown", + "id": "158d7d8c-a072-4a50-9714-43ebaefa53d1", + "metadata": {}, + "source": [ + "As you can see, each node is listed in the .json file, with the node attribute `preferred_impl_style` set to an empty string by default. We can now set the `preferred_impl_style` in this .json and pass it to a new builder flow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f464d35-6774-4751-80b4-b6230e501539", + "metadata": {}, + "outputs": [], + "source": [ + "with open(build_dir+\"/output_pre_and_post_proc/template_specialize_layers_config.json\", 'r') as json_file:\n", + " specialize_layers_config = json.load(json_file)\n", + "\n", + "# Set preferred_impl_style to hls for all layers\n", + "for key in specialize_layers_config:\n", + " if \"preferred_impl_style\" in specialize_layers_config[key]:\n", + " specialize_layers_config[key][\"preferred_impl_style\"] = \"hls\" \n", + "# Save as .json \n", + "with open(\"specialize_layers_all_hls.json\", \"w\") as jsonFile:\n", + " json.dump(specialize_layers_config, jsonFile)\n", + " \n", + "# Set SWG to RTL variant\n", + "for key in specialize_layers_config:\n", + " if \"preferred_impl_style\" in specialize_layers_config[key]:\n", + " if key.startswith(\"ConvolutionInputGenerator\"):\n", + " specialize_layers_config[key][\"preferred_impl_style\"] = \"rtl\"\n", + " else:\n", + " specialize_layers_config[key][\"preferred_impl_style\"] = \"hls\" \n", + "# Save as .json \n", + "with open(\"specialize_layers_swg_rtl.json\", \"w\") as jsonFile:\n", + " json.dump(specialize_layers_config, jsonFile)" + ] + }, + { + "cell_type": "markdown", + "id": "52592ea6-cd12-46b9-af91-5960b4749e7e", + "metadata": {}, + "source": [ + "We created two configuration files to pass as `specialize_layers_config_file`:\n", + "* One which sets `preferred_impl_style` to `\"hls\"` for all layers\n", + "* One which sets `preferred_impl_style` for the ConvolutionInputGenerator to `\"rtl\"`" + ] + }, + { + "cell_type": "markdown", + "id": "701905d8-c5cc-4cc0-b872-156c5b9d0432", + "metadata": {}, + "source": [ + "In the following, we will set up two build flows and run them up to the estimate reports step. Afterwards we will investigate the intermediate .onnx files and compare the two runs."
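Once the two builds below have finished, the Netron views can be complemented by a quick programmatic comparison; this sketch simply counts the op types in each run's `step_specialize_layers.onnx` snapshot (directory names as used below).

```python
from collections import Counter
from qonnx.core.modelwrapper import ModelWrapper

# Compare which HLS/RTL variants the two runs produced.
for run in ["output_all_hls", "output_swg_rtl"]:
    snapshot = build_dir + "/" + run + "/intermediate_models/step_specialize_layers.onnx"
    op_types = Counter(n.op_type for n in ModelWrapper(snapshot).graph.node)
    print(run, dict(op_types))
```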
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22ff1a91-7ef7-44cb-86d3-60b9af7a8c5e", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with custom specialize layers configuration\n", + "## specialize_layers_config_file = \"specialize_layers_all_hls.json\"\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_all_hls\"\n", + "\n", + "# Delete previous run results if they exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " specialize_layers_config_file = \"specialize_layers_all_hls.json\",\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9df41ff-ef6a-4d0e-ab36-241bb11ed241", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff617f21-6001-4bb7-9cf7-2cc2acd3fbec", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with custom specialize layers configuration\n", + "## specialize_layers_config_file = \"specialize_layers_swg_rtl.json\"\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_swg_rtl\"\n", + "\n", + "# Delete previous run results if they exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " specialize_layers_config_file = \"specialize_layers_swg_rtl.json\",\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f48ba95-f7b5-455b-8041-25b7341ad115", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "markdown", + "id": "bed4bedd-397d-4bd1-8531-c6ceac306715", 
"metadata": {}, + "source": [ + "First we are looking into the intermediate model after `step_create_dataflow_partition` and then after `step_specialize_layers`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e64db23-98cb-494b-851f-3cc2c3847451", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_all_hls/intermediate_models/step_create_dataflow_partition.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "3e1a6351-367f-47a6-b802-a2613ea455a1", + "metadata": {}, + "source": [ + "Let's have a look first at the model which we specialize to \"all HLS\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f85d6c42-153d-4a40-b3cc-a4c8c89fe636", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_all_hls/intermediate_models/step_specialize_layers.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "e1520920-b7de-42a5-9ec8-e8503992fbd1", + "metadata": {}, + "source": [ + "As you can see, each op type has now a suffix indicating that it is an HLS variant of the node. Additionally, when you click on one of the node in the Netron visualization, you can see that module is set to `finn.custom_op.fpgadataflow.hls`.\n", + "\n", + "Let's now have a look at the model in which we specialized the ConvolutionInputGenerator to `\"rtl\"`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f1f26a0-3a62-4920-bf40-5b1b798fa02e", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_swg_rtl/intermediate_models/step_specialize_layers.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "3f9c4de4-61ef-4698-ab23-87bf5953c5ae", + "metadata": {}, + "source": [ + "You can use the cells above to try out different settings and pass it to the builder flow. Please note that not all layers have HLS and RTL variants, so it might be that the setting you define in `specialize_layers_config.json` gets ignored and a sensible default is set instead. The FINN compiler will display a warning in this case." ] }, { @@ -648,7 +942,7 @@ "id": "c164040f", "metadata": {}, "source": [ - "The FINN compiler allows the user to implement a network in streaming dataflow architecture, this means every layer is implemented individually and the data is streamed through the accelerator. We can customize each layer for specific performance and resource requirements by adjusting the parallelism and resource type of each layer. In the FINN context we refer to this customization of parallelism in each layer as folding. To learn more details about the influence of folding factors/parallelism in FINN, please have a look at our [folding tutorial](3_folding.ipynb).\n", + "The FINN compiler allows the user to implement a network in streaming dataflow architecture, this means every layer is implemented individually and the data is streamed through the accelerator. We can customize each layer for specific performance and resource requirements by adjusting the parallelism and resource type of each layer. In the FINN context we refer to this customization of parallelism in each layer as folding. To learn more details about the influence of folding factors/parallelism in FINN, please have a look at our [folding tutorial](./3_folding.ipynb).\n", "\n", "In this section, we will look into the interface over which we can influence the customization of each layer using the FINN builder tool: A json file containing the folding configuration." 
] @@ -683,7 +977,7 @@ "source": [ "As you can see from the printed cell above, the keys in the .json file are the node names of the layers in our network. For each of the layers, some node attributes are listed:\n", "* `PE` and `SIMD` are the folding parameters that determine the parallelism of each layer, depending on the layer they can be set to different values, for details refer to [this table](https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer).\n", - "* `mem_mode`: determines if the parameter memory will be implemented as part of the HLS code (`const`) or instantiated separately and connected with the layer over a memory streamer unit (`decoupled`). You can find more details in this part of the documentation: https://finn-dev.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode . It is also possible to set the mem_mode to external which allows for the implementation for external weights.\n", + "* `mem_mode`: determines if the parameter memory will be implemented as part of the HLS/RTL code (`const`) or instantiated separately and connected with the layer over a memory streamer unit (`decoupled`). You can find more details in this part of the documentation: https://finn-dev.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode . It is also possible to set the mem_mode to `external`, which allows for the use of external weights.\n", "* `ram_style`: when selecting `decoupled` mode, the FINN compiler allows us to choose which memory resource will be used for the layer. The argument `ram_style` is set to the selected memory type:\n", " * `auto`: Vivado will make the decision if the implementation is using LUTRAM or BRAM\n", " * `distributed`: LUTRAM will be used\n", @@ -795,8 +1089,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", " \"step_generate_estimate_reports\",\n", @@ -899,8 +1194,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", " \"step_generate_estimate_reports\",\n", @@ -958,7 +1254,7 @@ "id": "97f87780", "metadata": {}, "source": [ - "The initial implementation already had a high utilization of BRAM, but the estimations went now up to 522 BRAMs while the LUT count went down to ~99k." + "The initial implementation already had a high utilization of BRAM, but the estimate has now gone up to ~500 BRAMs while the LUT count went down to ~99k." ] }, { @@ -1103,8 +1399,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_target_fps_parallelization\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", @@ -1239,7 +1536,7 @@ "source": [ "There are attributes that come from the dataclasses-json class: `to_dict`, `to_json`, `schema`, `from_json`, `from_dict`. This class is used for the implementation of the FINN builder. In this tutorial, we are mainly interested in the FINN specific arguments. 
\n", "\n", - "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. target_fps, fpga_part and folding_config_file. In the code of the FINN builder, the function of each builder argument is documents, you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." + "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. `target_fps`, `fpga_part` and `folding_config_file`. In the code of the FINN builder, the function of each builder argument is documents, you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." ] }, { @@ -1267,7 +1564,7 @@ "id": "b9bc5715", "metadata": {}, "source": [ - "You can see that after the generation of the estimate reports, the code generation and the ip generation is invoked (`step_hls_codegen` and `step_hls_ipgen`). The FIFO depths are determined and the FIFOs are inserted in the network (`step_set_fifo_depths`), we can then create an IP design of our whole network by stitching the IPs from each layer together (`step_create_stitched_ip`). At this point we have an implementation of the neural network that we can integrate within a bigger FPGA design, we can run performance measurements using simulation (`step_measure_rtlsim_performance`) and out-of-context synthesis (`step_out_of_context_synthesis`) for it.\n", + "You can see that after the generation of the estimate reports, the code generation and the ip generation is invoked (`step_hw_codegen` and `step_hw_ipgen`). The FIFO depths are determined and the FIFOs are inserted in the network (`step_set_fifo_depths`), we can then create an IP design of our whole network by stitching the IPs from each layer together (`step_create_stitched_ip`). At this point we have an implementation of the neural network that we can integrate within a bigger FPGA design, we can run performance measurements using simulation (`step_measure_rtlsim_performance`) and out-of-context synthesis (`step_out_of_context_synthesis`) for it.\n", "The FINN builder also provides automatic system integration for Zynq and Alveo devices, this can be invoked by running `step_synthesize_bitfile`, `step_make_pynq_driver` and `step_deployment_package`." ] }, @@ -1287,7 +1584,7 @@ "outputs": [], "source": [ "import finn.builder.build_dataflow_steps as build_dataflow_steps\n", - "print(build_dataflow_steps.step_hls_codegen.__doc__)" + "print(build_dataflow_steps.step_hw_codegen.__doc__)" ] }, { @@ -1297,7 +1594,7 @@ "metadata": {}, "outputs": [], "source": [ - "showSrc(build_dataflow_steps.step_hls_codegen)" + "showSrc(build_dataflow_steps.step_hw_codegen)" ] }, { @@ -1313,7 +1610,7 @@ "id": "3b98eb65", "metadata": {}, "source": [ - "### Examples for additional builder arguments & bitfile generation " + "### Example for additional builder arguments & bitfile generation " ] }, { @@ -1334,7 +1631,7 @@ "* A matrix multiplication\n", "* A MultiThreshold operation\n", "\n", - "When converting these nodes into HLS layers, by default the MatMul and the MultiThreshold gets converted into **one** component called Matrix-Vector-Activation Unit (MVAU). But the FINN compiler allows us to implement the activation separately. 
This gives an additional possibility for customization because we can adjust the folding parameters of the standalone threshold unit independently. \n", + "When converting these nodes into HW layers, by default the MatMul and the MultiThreshold get converted into **one** component called Matrix-Vector-Activation Unit (MVAU). But the FINN compiler allows us to implement the activation separately. This gives an additional possibility for customization because we can adjust the folding parameters of the standalone threshold unit independently. \n", "\n", "If you would like to enable this feature, you can set the build argument `standalone_thresholds` to `True`. In the code below this feature is enabled and you can have a look at the generated .onnx file. Please note that you need to uncomment the code first." ] @@ -1365,8 +1662,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_target_fps_parallelization\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", @@ -1408,103 +1706,6 @@ "#showInNetron(build_dir+\"/output_standalone_thresholds/intermediate_models/step_generate_estimate_reports.onnx\")" ] }, - { - "cell_type": "markdown", - "id": "074d8253", - "metadata": {}, - "source": [ - "#### RTL Convolutional Input Generator" - ] - }, - { - "cell_type": "markdown", - "id": "b85e5ac7", - "metadata": {}, - "source": [ - "Recently, we have worked on the *Operator Hardening* in the FINN compiler. This means that we implement core building blocks in RTL instead of using HLS.\n", - "One of these components is already available in the FINN compiler, you can enable the RTL implementation of the ConvolutionInputGenerator (aka Sliding Window Generator) by setting the build argument `force_rtl_conv_inp_gen` to `True`.\n", - "In the code below this feature is enabled and you can have a look at the generated .onnx file. Please note that you need to uncomment the code first." - ] - }, - { - "cell_type": "markdown", - "id": "2a90b63f", - "metadata": {}, - "source": [ - "
\n", - "Important notice: We are actively working on the integration of RTL components in the FINN flow, the enablement like shown below might change in the future.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab0c4974", - "metadata": {}, - "outputs": [], - "source": [ - "## Build flow with additional builder arguments enabled\n", - "## force_rtl_conv_inp_gen = True\n", - "\n", - "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", - "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", - "\n", - "output_dir = build_dir + \"/output_rtl_swg\"\n", - "\n", - "#Delete previous run results if exist\n", - "if os.path.exists(output_dir):\n", - " shutil.rmtree(output_dir)\n", - " print(\"Previous run results deleted!\")\n", - "\n", - "build_steps = [\n", - " custom_step_add_pre_proc,\n", - " custom_step_add_post_proc,\n", - " \"step_qonnx_to_finn\",\n", - " \"step_tidy_up\",\n", - " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", - " \"step_create_dataflow_partition\",\n", - " \"step_target_fps_parallelization\",\n", - " \"step_apply_folding_config\",\n", - " \"step_minimize_bit_width\",\n", - " \"step_generate_estimate_reports\",\n", - "]\n", - "\n", - "cfg_estimates = build.DataflowBuildConfig(\n", - " output_dir = output_dir,\n", - " mvau_wwidth_max = 80,\n", - " target_fps = 10000,\n", - " synth_clk_period_ns = 10.0,\n", - " fpga_part = \"xc7z020clg400-1\",\n", - " force_rtl_conv_inp_gen = True,\n", - " steps = build_steps,\n", - " generate_outputs=[\n", - " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19fe4d85", - "metadata": {}, - "outputs": [], - "source": [ - "#%%time\n", - "#build.build_dataflow_cfg(model_file, cfg_estimates);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c1f1ce9", - "metadata": {}, - "outputs": [], - "source": [ - "#showInNetron(build_dir+\"/output_rtl_swg/intermediate_models/step_generate_estimate_reports.onnx\")" - ] - }, { "cell_type": "markdown", "id": "601eb5f8", @@ -1526,17 +1727,15 @@ "id": "ffa2a352", "metadata": {}, "source": [ - "For an optimized design, we download the folding configuration for cnv-w2a2 on the Pynq-Z1 board from [finn-examples](https://github.com/Xilinx/finn-examples). And will pass it to the build flow. Please also note below that we now pass the board as argument to the builder (`board = \"Pynq-Z1\"`) instead of just the fpga part. This time we will select all possible outputs to generate. Please be aware that running the full build might take a few hours." + "For an optimized design, we saved a local copy of the folding configuration for cnv-w2a2 on the Pynq-Z1 board from [finn-examples](https://github.com/Xilinx/finn-examples) in this folder. And will pass it to the build flow. Please also note below that we now pass the board as argument to the builder (`board = \"Pynq-Z1\"`) instead of just the fpga part. This time we will select all possible outputs to generate. Please be aware that running the full build might take a few hours." ] }, { - "cell_type": "code", - "execution_count": null, - "id": "765e5ee7", + "cell_type": "markdown", + "id": "8d1b041f-027c-444e-81ac-98ce9b6d1b51", "metadata": {}, - "outputs": [], "source": [ - "!wget https://raw.githubusercontent.com/Xilinx/finn-examples/main/build/bnn-pynq/folding_config/cnv-w2a2_folding_config.json" + "Note that we set one additional argument: `default_swg_exception = True`. This is done because this example is customized to fit on the Pynq-Z1 board, to optimize the resources we remove FIFOs between SWGs and MVAUs manually to avoid unnecessary buffering." 
] }, { @@ -1569,14 +1768,15 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_target_fps_parallelization\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", " \"step_generate_estimate_reports\",\n", - " \"step_hls_codegen\",\n", - " \"step_hls_ipgen\",\n", + " \"step_hw_codegen\",\n", + " \"step_hw_ipgen\",\n", " \"step_set_fifo_depths\",\n", " \"step_create_stitched_ip\",\n", " \"step_measure_rtlsim_performance\",\n", @@ -1587,13 +1787,15 @@ "]\n", "\n", "cfg_build = build.DataflowBuildConfig(\n", - " output_dir = output_dir,\n", - " mvau_wwidth_max = 80,\n", - " synth_clk_period_ns = 10.0,\n", - " folding_config_file = \"cnv-w2a2_folding_config.json\",\n", - " board = \"Pynq-Z1\",\n", - " shell_flow_type = build_cfg.ShellFlowType.VIVADO_ZYNQ,\n", - " steps = build_steps,\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " #specialize_layers_config_file = \"specialize_layers_all_hls.json\",\n", + " folding_config_file = \"cnv-w2a2_folding_config.json\",\n", + " board = \"Pynq-Z1\",\n", + " shell_flow_type = build_cfg.ShellFlowType.VIVADO_ZYNQ,\n", + " steps = build_steps,\n", + " default_swg_exception = True,\n", " generate_outputs=[\n", " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", " build_cfg.DataflowOutputType.STITCHED_IP,\n", @@ -1634,7 +1836,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/advanced/cnv-w2a2_folding_config.json b/notebooks/advanced/cnv-w2a2_folding_config.json new file mode 100644 index 0000000000..68409ff695 --- /dev/null +++ b/notebooks/advanced/cnv-w2a2_folding_config.json @@ -0,0 +1,79 @@ +{ + "Defaults": {}, + "Thresholding_hls_0": { + "PE": 1, + "ram_style": "distributed" + }, + "ConvolutionInputGenerator_rtl_0": { + "SIMD": 3, + "ram_style": "distributed" + }, + "MVAU_hls_0": { + "PE": 8, + "SIMD": 3, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_1": { + "SIMD": 16, + "ram_style": "distributed" + }, + "MVAU_hls_1": { + "PE": 16, + "SIMD": 16, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_2": { + "SIMD": 16, + "ram_style": "distributed" + }, + "MVAU_hls_2": { + "PE": 8, + "SIMD": 16, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_3": { + "SIMD": 16, + "ram_style": "distributed" + }, + "MVAU_hls_3": { + "PE": 8, + "SIMD": 16, + "ram_style": "block" + }, + "ConvolutionInputGenerator_rtl_4": { + "SIMD": 8, + "ram_style": "distributed" + }, + "MVAU_hls_4": { + "PE": 4, + "SIMD": 8, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_5": { + "SIMD": 8, + "ram_style": "distributed" + }, + "MVAU_hls_5": { + "PE": 1, + "SIMD": 8, + "ram_style": "auto" + }, + "MVAU_hls_6": { + "PE": 1, + "SIMD": 2, + "ram_style": "distributed" + }, + "MVAU_hls_7": { + "PE": 2, + "SIMD": 2, + "ram_style": "block" + }, + "MVAU_hls_8": { + "PE": 5, + "SIMD": 1, + "ram_style": "distributed" + }, + "LabelSelect_hls_0": { + "PE": 1 + } +} diff --git a/notebooks/advanced/cybsec_PE_SIMD.onnx b/notebooks/advanced/cybsec_PE_SIMD.onnx index b450cc9e43..8d42b2e37b 100644 Binary files a/notebooks/advanced/cybsec_PE_SIMD.onnx and b/notebooks/advanced/cybsec_PE_SIMD.onnx differ diff --git a/notebooks/basics/0_how_to_work_with_onnx.ipynb 
b/notebooks/basics/0_how_to_work_with_onnx.ipynb index 35a83ea97b..f1b3dcf68b 100644 --- a/notebooks/basics/0_how_to_work_with_onnx.ipynb +++ b/notebooks/basics/0_how_to_work_with_onnx.ipynb @@ -613,9 +613,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb index f15f716e7f..5c2f10310f 100644 --- a/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb +++ b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb @@ -177,7 +177,7 @@ "source": [ "## 3. Import into FINN and converting QONNX to FINN-ONNX\n", "\n", - "Similarily to the 1a notebook we will first run a cleanup transformation on the exported QONNX model." + "We will first run a cleanup transformation on the exported QONNX model." ] }, { @@ -318,9 +318,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index 9e9d52e476..3141d54ddf 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -46,8 +46,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n", - "There is an additional section for functional verification (red section) on the left side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n", + "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of which includes several flow steps. The flow starts in the top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) to bring the network into a form in which each layer can be represented by either a Vitis HLS function or a Verilog module. The model then gets passed to Vivado IPI stitching (orange section), and finally a PYNQ overlay bitfile is built and can be tested on a PYNQ board (yellow section).\n", + "There is an additional section for functional verification (red section) on the right side of the diagram, which we will not cover in this notebook. 
For details please take a look at the verification notebook, which you can find [here](tfc_end2end_verification.ipynb).\n", "\n", "We will use the helper function `showInNetron` to show the ONNX model at the current transformation step. The Netron displays are interactive, but they only work when running the notebook actively and not on GitHub (i.e. if you are viewing this on GitHub you'll only see blank squares)." @@ -207,7 +207,7 @@ "\n", "![](cnv-mp-fc.png)\n", "\n", - "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vitis HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib).\n", + "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), sometimes also called the matrix-vector-activation unit (MVAU). But now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vitis HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib) and/or as RTL modules in [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib).\n", "\n", "\n", "To target this kind of hardware architecture with our network we'll apply a convolution lowering transformation, in addition to streamlining. You may recall the *streamlining transformation* that we applied to the TFC-w1a1 network, which is a series of mathematical simplifications that allow us to get rid of floating point scaling operations by implementing few-bit activations as thresholding operations. \n", @@ -252,7 +252,7 @@ "\n", "* `Streamline` moves floating point scaling and addition operations closer to the input of the nearest thresholding activation and absorbs them into thresholds\n", "* `LowerConvsToMatMul` converts ONNX `Conv` nodes into sequences of `Im2Col, MatMul` nodes as discussed above. `Im2Col` is a custom FINN ONNX high-level node type that implements the sliding window operator.\n", - "* `MakeMaxPoolNHWC` and `AbsorbTransposeIntoMultiThreshold` convert the *data layout* of the network into the NHWC data layout that finn-hlslib primitives use. NCHW means the tensor dimensions are ordered as `(N : batch, H : height, W : width, C : channels)` (assuming 2D images). The ONNX standard ops normally use the NCHW layout, but the ONNX intermediate representation itself does not dictate any data layout.\n", + "* `MakeMaxPoolNHWC` and `AbsorbTransposeIntoMultiThreshold` convert the *data layout* of the network into the NHWC data layout that finn-hlslib and finn-rtllib primitives use. NCHW means the tensor dimensions are ordered as `(N : batch, H : height, W : width, C : channels)` (assuming 2D images). The ONNX standard ops normally use the NCHW layout, but the ONNX intermediate representation itself does not dictate any data layout.\n", "* You may recall `ConvertBipolarMatMulToXnorPopcount` from the TFC-w1a1 example, which is needed to implement bipolar-by-bipolar (w1a1) networks correctly using finn-hlslib.\n", "\n", "Let's visualize the streamlined and lowered network with Netron. 
Observe how all the `Conv` nodes have turned into pairs of `Im2Col, MatMul` nodes, and many nodes including `BatchNorm, Mul, Add` nodes have disappeared and replaced with `MultiThreshold` nodes." @@ -271,9 +271,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Partitioning, Conversion to HLS Layers and Folding\n", + "## 3. Partitioning, Conversion to HW Layers and Folding\n", "\n", - "The next steps will be (again) very similar to what we did for the TFC-w1a1 network. We'll first convert the layers that we can put into the FPGA into their HLS equivalents and separate them out into a *dataflow partition*:\n" + "The next steps will be (again) very similar to what we did for the TFC-w1a1 network. We'll first convert the layers that we can put into the FPGA into their HW equivalents, separate them out into a *dataflow partition* and specialize them to HLS variants:\n" ] }, { @@ -282,27 +282,25 @@ "metadata": {}, "outputs": [], "source": [ - "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n", + "import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw\n", "from finn.transformation.fpgadataflow.create_dataflow_partition import (\n", " CreateDataflowPartition,\n", ")\n", "from finn.transformation.move_reshape import RemoveCNVtoFCFlatten\n", + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", "from qonnx.custom_op.registry import getCustomOp\n", "from qonnx.transformation.infer_data_layouts import InferDataLayouts\n", "\n", - "# choose the memory mode for the MVTU units, decoupled or const\n", - "mem_mode = \"decoupled\"\n", - "\n", "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_streamlined.onnx\")\n", - "model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))\n", - "model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))\n", + "model = model.transform(to_hw.InferBinaryMatrixVectorActivation())\n", + "model = model.transform(to_hw.InferQuantizedMatrixVectorActivation())\n", "# TopK to LabelSelect\n", - "model = model.transform(to_hls.InferLabelSelectLayer())\n", + "model = model.transform(to_hw.InferLabelSelectLayer())\n", "# input quantization (if any) to standalone thresholding\n", - "model = model.transform(to_hls.InferThresholdingLayer())\n", - "model = model.transform(to_hls.InferConvInpGen())\n", - "model = model.transform(to_hls.InferStreamingMaxPool())\n", - "# get rid of Reshape(-1, 1) operation between hlslib nodes\n", + "model = model.transform(to_hw.InferThresholdingLayer())\n", + "model = model.transform(to_hw.InferConvInpGen())\n", + "model = model.transform(to_hw.InferStreamingMaxPool())\n", + "# get rid of Reshape(-1, 1) operation between hw nodes\n", "model = model.transform(RemoveCNVtoFCFlatten())\n", "# get rid of Tranpose -> Tranpose identity seq\n", "model = model.transform(absorb.AbsorbConsecutiveTransposes())\n", @@ -314,7 +312,9 @@ "sdp_node = getCustomOp(sdp_node)\n", "dataflow_model_filename = sdp_node.get_nodeattr(\"model\")\n", "# save the dataflow partition with a different name for easier access\n", + "# and specialize the layers to HLS variants\n", "dataflow_model = ModelWrapper(dataflow_model_filename)\n", + "dataflow_model = dataflow_model.transform(SpecializeLayers())\n", "dataflow_model.save(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")" ] }, @@ -322,7 +322,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Notice the additional `RemoveCNVtoFCFlatten` transformation that was not used for TFC-w1a1. 
In the last Netron visualization you may have noticed a `Reshape` operation towards the end of the network where the convolutional part of the network ends and the fully-connected layers start. That `Reshape` is essentially a tensor flattening operation, which we can remove for the purposes of hardware implementation. We can examine the contents of the dataflow partition with Netron, and observe the `ConvolutionInputGenerator`, `MatrixVectorActivation` and `StreamingMaxPool_Batch` nodes that implement the sliding window, matrix multiply and maxpool operations. *Note that the MatrixVectorActivation instances following the ConvolutionInputGenerator nodes are really implementing the convolutions, despite the name. The final three MatrixVectorActivation instances implement actual FC layers.*" ] }, { @@ -364,7 +364,7 @@ "outputs": [], "source": [ "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")\n", - "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", + "fc_layers = model.get_nodes_by_op_type(\"MVAU_hls\")\n", "# each tuple is (PE, SIMD, in_fifo_depth) for a layer\n", "folding = [\n", " (16, 3, [128]),\n", @@ -384,7 +384,7 @@ " fcl_inst.set_nodeattr(\"inFIFODepths\", ififodepth)\n", "\n", "# use same SIMD values for the sliding window operators\n", - "swg_layers = model.get_nodes_by_op_type(\"ConvolutionInputGenerator\")\n", + "swg_layers = model.get_nodes_by_op_type(\"ConvolutionInputGenerator_rtl\")\n", "for i in range(len(swg_layers)):\n", " swg_inst = getCustomOp(swg_layers[i])\n", " simd = folding[i][1]\n", @@ -398,7 +398,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Below we visualize in Netron to observe the `StreamingDataWidthConverter` and `StreamingFIFO` nodes that have been inserted into graph, as well as the folding factors in the `PE` and `SIMD` attributes of each `MatrixVectorActivation`." + "Below we visualize in Netron to observe the folding factors in the `PE` and `SIMD` attributes of each `MVAU_hls`."
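Besides inspecting the attributes in Netron, the effect of the chosen folding can also be estimated numerically. A short sketch using FINN's expected-cycles analysis pass (assuming it is available under this module path in your FINN version):

```python
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer

# Expected cycles per node for the chosen folding; the slowest layer
# determines the steady-state throughput of the dataflow pipeline.
cycles_dict = model.analysis(exp_cycles_per_layer)
print(cycles_dict)
```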
] }, { diff --git a/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg b/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg index fa36be96c5..561770f2da 100755 --- a/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg +++ b/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg @@ -1 +1 @@ - + diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index a5c97328a5..bbaa74dbff 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -33,7 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n", + "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of which includes several flow steps. The flow starts in the top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) to bring the network into a form in which each layer can be represented by either a Vitis HLS function or a Verilog module. The model then gets passed to Vivado IPI stitching (orange section), and finally a PYNQ overlay bitfile is built and can be tested on a PYNQ board (yellow section).\n", "There is an additional section for functional verification (red section) on the right side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n", "\n", "\n", @@ -114,7 +114,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. 'ModelWrapper' is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx), this repository contains several functionality that is used in FINN. The model was exported in QONNX format, to feed it into the FINN flow, our first step is to convert it to the FINN-ONNX format." + "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. `ModelWrapper` is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx); this repository contains several functionalities that are used in FINN. The model was exported in QONNX format; to feed it into the FINN flow, our first step is to convert it to the FINN-ONNX format."
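As a quick illustration of those helper functions, a sketch (the file name is an assumption based on this notebook's earlier export step):

```python
from qonnx.core.modelwrapper import ModelWrapper

# A few of the ModelWrapper conveniences used throughout this notebook.
model = ModelWrapper(build_dir + "/tfc_w1_a1.onnx")  # assumed export path
print(model.graph.node[0].op_type)                        # first node's op type
print(model.get_tensor_shape(model.graph.input[0].name))  # global input shape
```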
] }, { @@ -129,6 +129,23 @@ "model = model.transform(ConvertQONNXtoFINN())" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the conversion we save the model and visualize it using Netron. As you can see, quantization is now expressed differently. Where we had Quant nodes before, there are now MultiThreshold nodes present in the graph." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.save(build_dir+\"/tfc_w1_a1_finn.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1_a1_finn.onnx\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -149,8 +166,9 @@ "* [FINN-style Dataflow Architectures](#dataflow_arch)\n", "* [Tidy-up transformations](#basic_trafo)\n", "* [Streamlining](#streamline)\n", - "* [Conversion to HLS layers](#hls_layers)\n", + "* [Conversion to HW layers](#hw_layers)\n", "* [Creating a Dataflow Partition](#dataflow_partition)\n", + "* [Specialize layers](#specialize_layers)\n", "* [Folding and Datawidth Converter, FIFO and TLastMarker Insertion](#folding)\n", "\n", "\n", @@ -167,7 +185,7 @@ "\n", "![](finn-hw-arch.png)\n", "\n", - "In practice, the compute arrays are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library. As these function calls can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls, which is the goal of the network preparation process." + "In practice, the compute arrays are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library or by Verilog modules from the [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib). As these function calls/modules can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls/modules, which is the goal of the network preparation process." ] }, { @@ -254,7 +272,7 @@ "\n", "In FINN, we can bake some of these pre/postprocessing operatings into the graph, and in some cases these can be highly beneficial for performance by allowing our accelerator to directly consume raw data instead of going through CPU preprocessing. \n", "\n", - "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L86), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." + "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L93), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. 
We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor`) and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." ] }, { @@ -407,32 +425,25 @@ "model = model.transform(InferDataLayouts())\n", "model = model.transform(RemoveUnusedTensors())\n", "\n", - "model.save(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")\n", - "showInNetron(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")" + "model.save(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Observe the pairs of `XnorPopcountmatMul` and `MultiThreshold` layers following each other -- this is the particular pattern that the next step will be looking for in order to convert them to HLS layers." + "Observe the pairs of `XnorPopcountmatMul` and `MultiThreshold` layers following each other -- this is the particular pattern that the next step will be looking for in order to convert them to hardware (HW) layers." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Conversion to HLS layers \n", - "Converts the nodes to HLS layers that correspond to the functions in [finn-hls library](https://finn-hlslib.readthedocs.io/en/latest/). In our case this transformation converts pairs of binary XnorPopcountMatMul layers to MatrixVectorActivation layers. Any immediately following MultiThreshold layers will also be absorbed into the MVTU.\n", + "### Conversion to HW layers \n", + "Converts the nodes to HW layers. These are abstraction layers that do not directly correspond to an HLS or Verilog implementation, but will be converted into either one later in the flow. In our case this transformation converts pairs of binary XnorPopcountMatMul layers to MVAU (Matrix-Vector-Activation Unit) layers. Any immediately following MultiThreshold layers will also be absorbed into the MVAU.\n", "\n", - "Below is the code for the transformation and the network is visualized using netron to create the new structure with `MatrixVectorActivation` nodes, which will correspond to a function call from the [finn-hlslib](https://finn-hlslib.readthedocs.io/en/latest/library/matrixvector.html) library." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note:** The transformation `to_hls.InferBinaryMatrixVectorActivation` gets the string \"decoupled\" as argument, this indicates the `mem_mode` for the weights. In FINN there are different options to set the way the weights are stored and accessed. For details please have a look on the [FINN readthedocs website](https://finn.readthedocs.io/) under Internals." + "Below is the code for the transformation, and the network is visualized using Netron to show the new structure with `MVAU` nodes."
] }, { @@ -441,22 +452,15 @@ "metadata": {}, "outputs": [], "source": [ - "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n", - "model = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")\n", - "model = model.transform(to_hls.InferBinaryMatrixVectorActivation(\"decoupled\"))\n", + "import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw\n", + "model = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")\n", + "model = model.transform(to_hw.InferBinaryMatrixVectorActivation())\n", "# TopK to LabelSelect\n", - "model = model.transform(to_hls.InferLabelSelectLayer())\n", + "model = model.transform(to_hw.InferLabelSelectLayer())\n", "# input quantization (if any) to standalone thresholding\n", - "model = model.transform(to_hls.InferThresholdingLayer())\n", - "model.save(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")\n", - "showInNetron(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Each MatrixVectorActivation node has two attributes that specify the degree of folding, PE and SIMD. In all nodes the values for these attributes are set as default to 1, which would correspond to a maximum folding (time multiplexing) and thus minimum performance. We will shortly cover how these can be adjusted, but first we want to separate the HLS layers from the non-HLS layers in this network." + "model = model.transform(to_hw.InferThresholdingLayer())\n", + "model.save(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")" ] }, { @@ -465,7 +469,7 @@ "source": [ "### Creating a Dataflow Partition \n", "\n", - "In the graph above, you can see that there is a mixture of FINN HLS layers (MatrixVectorActivation and Thresholding_Batch) with one regular ONNX layers (Reshape). To create a bitstream, FINN needs a model with only HLS layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition." + "In the graph above, you can see that there is a mixture of FINN HW layers (`MVAU` and `Thresholding`) with one regular ONNX layer (Reshape). To create a bitstream, FINN needs a model with only HW layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HW layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition."
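The `model` attribute on that placeholder points to the extracted child graph and can be read back through the CustomOp wrapper. A minimal sketch, assuming `parent_model` holds the partitioned graph produced by `CreateDataflowPartition` in the next hunk:

```python
# Hedged sketch: fetch the extracted, dataflow-only child graph from the
# StreamingDataflowPartition placeholder node.
from qonnx.custom_op.registry import getCustomOp

sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
dataflow_model_filename = getCustomOp(sdp_node).get_nodeattr("model")
print(dataflow_model_filename)  # path to the child .onnx model
```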
] }, { @@ -476,7 +480,7 @@ "source": [ "from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition\n", "\n", - "model = ModelWrapper(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")\n", + "model = ModelWrapper(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")\n", "parent_model = model.transform(CreateDataflowPartition())\n", "parent_model.save(build_dir+\"/tfc_w1_a1_dataflow_parent.onnx\")\n", "showInNetron(build_dir+\"/tfc_w1_a1_dataflow_parent.onnx\")" @@ -486,7 +490,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that the `MatrixVectorActivation` instances and the `Thresholding_Batch` in the beginning have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HLS dataflow-only graph:" + "We can see that the `MVAU` instances and the `Thresholding` in the beginning have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HW dataflow-only graph:" ] }, { @@ -506,7 +510,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see all the extracted `MatrixVectorActivation` instances and the `Thresholding_Batch` have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it." + "We can see all the extracted `MVAU` instances and the `Thresholding` have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it." ] }, { @@ -518,6 +522,60 @@ "model = ModelWrapper(dataflow_model_filename)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Specialize layers " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The network is now converted to HW abstraction layers, and we have excluded the non-HW layers to continue with the processing of the model. HW abstraction layers are abstract (placeholder) layers that can be implemented either in HLS or as an RTL module using FINN. In the next flow step, we convert each of these layers to either an HLS or RTL variant by calling the `SpecializeLayers` transformation. It is possible to let the FINN flow know a preference for the implementation style (`\"hls\"` or `\"rtl\"`); depending on the layer type, this preference will either be fulfilled or a reasonable default will be chosen. In the tfc example, we will set all layers to their HLS variants. To showcase how to set the preferred implementation style, we will set the node attribute of the `Thresholding` layer to `\"hls\"`; for the `MVAUs` and the `LabelSelect` we will leave this node attribute empty, in which case it defaults to HLS." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "thresh_node = model.get_nodes_by_op_type(\"Thresholding\")[0]\n", + "thresh_node_inst = getCustomOp(thresh_node)\n", + "thresh_node_inst.set_nodeattr(\"preferred_impl_style\", \"hls\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we will call `SpecializeLayers` to convert each HW abstraction layer to (in this case) an HLS variant."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", + "model = model.transform(SpecializeLayers())\n", + "\n", + "model.save(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each node type now has a suffix (`_hls`), and the module (`finn.custom_op.fpgadataflow.hls`) also indicates that the HLS variant of the layer is selected.\n", + "We can now proceed by adjusting the parallelism of each node to customize the performance and resource usage." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -526,14 +584,17 @@ "\n", "*Folding* in FINN describes how much a layer is time-multiplexed in terms of execution resources. There are several *folding factors* for each layer, controlled by the PE (parallelization over outputs) and SIMD (parallelization over inputs) parameters as described by the original [FINN paper](https://arxiv.org/pdf/1612.07119). The higher the PE and SIMD values are set, the faster the generated accelerator will run, and the more FPGA resources it will consume. \n", "\n", - "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a MatrixVectorActivation operation. This is where the Netron visualization helps us, in the above diagram we can see that the model contains four MatrixVectorActivation. So as an example we extract the second node of the graph." + "Each MVAU_hls node has two attributes that specify the degree of folding, PE and SIMD. In all nodes the values for these attributes are set by default to 1, which corresponds to maximum folding (time multiplexing) and thus minimum performance. \n", + "\n", + "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a Matrix-Vector-Activation operation. This is where the Netron visualization helps us: in the above diagram we can see that the model contains four `MVAUs`. As an example, we extract the second node of the graph." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/main/src/finn/custom_op/fpgadataflow/hlscustomop.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes." + "We can use the higher-level CustomOp wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Above, we have already used this abstraction to set the node attribute of the Thresholding HW layer.\n", + "Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes."
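A hedged sketch of that inspection, assuming `model` is the specialized child model from the cells above; the example PE/SIMD values are placeholders and legal settings must divide the layer's matrix dimensions:

```python
# Sketch: list the attributes a CustomOp wrapper defines and adjust folding.
from qonnx.custom_op.registry import getCustomOp

fc = model.get_nodes_by_op_type("MVAU_hls")[1]  # e.g. the second MVAU in the graph
fc_inst = getCustomOp(fc)
print(fc_inst.get_nodeattr_types())             # all node attributes and their types
fc_inst.set_nodeattr("SIMD", 16)                # parallelization over inputs
fc_inst.set_nodeattr("PE", 16)                  # parallelization over outputs
```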
] }, { @@ -564,7 +625,7 @@ "metadata": {}, "outputs": [], "source": [ - "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", + "fc_layers = model.get_nodes_by_op_type(\"MVAU_hls\")\n", "# (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer\n", "config = [\n", " (16, 49, [16], [64], \"block\"),\n", @@ -581,7 +642,7 @@ " fcl_inst.set_nodeattr(\"ram_style\", ramstyle)\n", " \n", "# set parallelism for input quantizer to be same as first layer's SIMD\n", - "inp_qnt_node = model.get_nodes_by_op_type(\"Thresholding_Batch\")[0]\n", + "inp_qnt_node = model.get_nodes_by_op_type(\"Thresholding_hls\")[0]\n", "inp_qnt = getCustomOp(inp_qnt_node)\n", "inp_qnt.set_nodeattr(\"PE\", 49)" ] @@ -658,7 +719,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In previous versions of FINN, we had to manually go through several steps to generate HLS code, stitch IP, create a PYNQ project and run synthesis. All these steps are now performed by the `ZynqBuild` transform (or the `VitisBuild` transform for Alveo). **As this involves calling HLS synthesis and Vivado synthesis, this transformation will run for some time (up to half an hour depending on your PC).**" + "In previous versions of FINN, we had to manually go through several steps to generate HLS/RTL code, stitch IP, create a PYNQ project and run synthesis. All these steps are now performed by the `ZynqBuild` transform (or the `VitisBuild` transform for Alveo). **As this involves calling HLS synthesis and Vivado synthesis, this transformation will run for some time (up to half an hour depending on your PC).**" ] }, { @@ -740,7 +801,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that `StreamingFIFO` and `StreamingDataWidthConverter` instances have been automatically inserted into the graph prior to hardware build. Transformations like `ZynqBuild` use the `metadata_props` of the model to put in additional metadata information relevant to the results of the transformation. Let's examine the metadata for the current graph containing all layers:" + "We can see that `StreamingFIFO` and `StreamingDataWidthConverter` instances have been automatically inserted into the graph prior to hardware build. Both layer types are inserted as RTL variants. Transformations like `ZynqBuild` use the `metadata_props` of the model to put in additional metadata information relevant to the results of the transformation. Let's examine the metadata for the current graph containing all layers:" ] }, { @@ -1014,9 +1075,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb index 2f6cde6e5b..a07a8d2254 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb @@ -7,16 +7,16 @@ "# FINN - Functional Verification of End-to-End Flow\n", "-----------------------------------------------------------------\n", "\n", - "**Important: This notebook depends on the tfc_end2end_example notebook, because we are using models that are available at intermediate steps in the end-to-end flow. 
So please make sure the needed .onnx files are generated to run this notebook.**\n", + "**Important: This notebook depends on the [tfc_end2end_example](tfc_end2end_example.ipynb) notebook, because we are using models that are available at intermediate steps in the end-to-end flow. So please make sure the needed .onnx files are generated to run this notebook.**\n", "\n", - "In this notebook, we will show how to take the intermediate results of the end-to-end tfc example and verify their functionality with different methods. In the following picture you can see the section in the end-to-end flow about the *Simulation & Emulation Flows*. Besides the methods in this notebook, there is another one that is covered in the Jupyter notebook [tfc_end2end_example](tfc_end2end_example.ipynb): remote execution. The remote execution allows functional verification directly on the PYNQ board, for details please have a look at the mentioned Jupyter notebook." + "In this notebook, we will show how to take the intermediate results of the end-to-end tfc example and verify their functionality with different methods. In the following picture you can see the section in the end-to-end flow about the *Simulation & Emulation Flows*. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\"Drawing\"" + "\"Drawing\"" ] }, { @@ -72,9 +72,9 @@ "source": [ "## Simulation using Python \n", "\n", - "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\neq$ `fpgadataflow`) this model can be checked for functionality using Python.\n", + "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\neq$ `fpgadataflow.hls` and `backend` $\neq$ `fpgadataflow.rtl`), this model can be checked for functionality using Python.\n", "\n", - "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution, functions are defined. The following is an example of the execution function of a XNOR popcount node.\n" + "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes, execution functions are defined. The following is an example of the execution function of an XNOR popcount node.\n" ] }, { @@ -95,7 +95,7 @@ "\n", "This execution function and onnxruntime are used when `execute_onnx` from `onnx_exec` is applied to the model. The model is then simulated node by node and the result is stored in a context dictionary, which contains the values of each tensor at the end of the execution. To get the result, only the output tensor has to be extracted.\n", "\n", - "The procedure is shown below. We take the model right before the nodes should be converted into HLS layers and generate an input tensor to pass to the execution function. The input tensor is generated from the Brevitas example inputs." + "The procedure is shown below. We take the model right before the nodes are converted into HW layers and generate an input tensor to pass to the execution function. The input tensor is generated from the Brevitas example inputs."
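A minimal sketch of that procedure; the names `model_for_sim` and `input_dict` follow the notebook cells nearby and are set up in the next hunk:

```python
# Hedged sketch: node-by-node Python simulation via execute_onnx.
import finn.core.onnx_exec as oxe

output_dict = oxe.execute_onnx(model_for_sim, input_dict)
output_tensor_name = model_for_sim.graph.output[0].name
produced = output_dict[output_tensor_name]  # simulated output tensor
```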
] }, { @@ -108,7 +108,7 @@ "from qonnx.core.modelwrapper import ModelWrapper\n", "input_dict = {\"global_in\": nph.to_array(input_tensor)}\n", "\n", - "model_for_sim = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")" + "model_for_sim = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")" ] }, { @@ -141,7 +141,16 @@ "source": [ "## Simulation (cppsim) using C++\n", "\n", - "When dealing with HLS custom op nodes in FINN the simulation using Python is no longer sufficient. After the nodes have been converted to HLS layers, the simulation using C++ can be used. To do this, the input tensor is stored in a .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding finn-hlslib function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. For this example the model after setting the folding factors in the HLS layers is used, please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model." + "When dealing with HLS or RTL custom op nodes in FINN, the simulation using Python is no longer sufficient. If the nodes are specialized to HLS layers, the simulation using C++ can be used. To do this, the input tensor is stored in a .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding `finn-hlslib` function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. For this example we use the model after setting the folding factors in the HLS variants of the layers. Please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "Note: HW layer can also be converted to RTL variants, in this case \"cppsim\" is not an option we can execute. If nevertheless \"cppsim\" is selected as execution mode for the layer, the execution defaults to the parent class. Like this, networks with a mix of HLS and RTL layers can be executed using \"cppsim\" for the HLS layers. \n", + "
" ] }, { @@ -158,7 +167,7 @@ "metadata": {}, "source": [ "To generate the code for this simulation and to generate the executable two transformations are used:\n", - "* `PrepareCppSim` which generates the C++ code for the corresponding hls layer\n", + "* `PrepareCppSim` which generates the C++ code for the corresponding HLS layer\n", "* `CompileCppSim` which compules the C++ code and stores the path to the executable" ] }, @@ -280,9 +289,9 @@ "source": [ "## Emulation (rtlsim) using PyVerilator\n", "\n", - "The emulation using [PyVerilator](https://github.com/maltanar/pyverilator) can be done after IP blocks are generated from the corresponding HLS layers. Pyverilator is a tool which makes it possible to simulate verilog files using verilator via a python interface.\n", + "The emulation using [PyVerilator](https://github.com/maltanar/pyverilator) can be done after IP blocks are generated from the corresponding HLS layers or for RTL layers directly using the generated Verilog files. Pyverilator is a tool which makes it possible to simulate verilog files using verilator via a python interface.\n", "\n", - "We have two ways to use rtlsim, one is to run the model node-by-node as with the simulation methods, but if the model is in the form of the dataflow partition, the part of the graph that consist of only HLS nodes could also be executed as whole." + "We have two ways to use rtlsim, one is to run the model node-by-node as with the simulation methods, but if the model is in the form of the dataflow partition, the part of the graph that consist of only HLS/RTL nodes could also be executed as whole." ] }, { @@ -380,18 +389,14 @@ "source": [ "from finn.transformation.fpgadataflow.insert_dwc import InsertDWC\n", "from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO\n", + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", "from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP\n", "\n", "child_model = ModelWrapper(build_dir + \"/tfc_w1_a1_dataflow_child.onnx\")\n", - "child_model = child_model.transform(InsertDWC())\n", - "\n", - "# set all impl_styles of the DWCs to hls to enable emulation\n", - "dwc_nodes = child_model.get_nodes_by_op_type(\"StreamingDataWidthConverter_Batch\")\n", - "for dwc in dwc_nodes:\n", - " dwc_inst = getCustomOp(dwc)\n", - " dwc_inst.set_nodeattr(\"impl_style\", \"hls\")\n", - " \n", + "child_model = child_model.transform(InsertDWC()) \n", "child_model = child_model.transform(InsertFIFO(create_shallow_fifos=True))\n", + "# DWC and FIFOs need to be specialized to either HLS or RTL variants\n", + "child_model = child_model.transform(SpecializeLayers())\n", "child_model.save(build_dir + \"/test.onnx\");\n", "child_model = child_model.transform(GiveUniqueNodeNames())\n", "child_model = child_model.transform(PrepareIP(test_fpga_part, target_clk_ns))\n", @@ -455,7 +460,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/bnn-pynq/verification.png b/notebooks/end2end_example/bnn-pynq/verification.png deleted file mode 100755 index cb50ba1b67..0000000000 Binary files a/notebooks/end2end_example/bnn-pynq/verification.png and /dev/null differ diff --git a/notebooks/end2end_example/bnn-pynq/verification.svg b/notebooks/end2end_example/bnn-pynq/verification.svg new file mode 100755 index 0000000000..9cf8e86088 --- /dev/null +++ 
b/notebooks/end2end_example/bnn-pynq/verification.svg @@ -0,0 +1 @@ + diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index 7644173284..da037050bb 100644 --- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -769,7 +769,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb index a5bc165573..33b64e11c0 100644 --- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb +++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb @@ -399,7 +399,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 80f3cd3819..73cd25cf20 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -265,7 +265,7 @@ "\n", "**Live FINN tutorial:** These next builds will take about 10 minutes to complete since multiple calls to Vivado and a call to RTL simulation are involved. While this is running, you can examine the generated files with noVNC -- it is running on **(your AWS URL):6080/vnc.html**\n", "\n", - "* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_MatrixVectorActivation_XXXXXX`\n", + "* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_MVAU_hls_XXXXXX`\n", " \n", "* Once the `step_create_stitched_ip [11/16]` below is completed, you can view the generated stitched IP in Vivado under `/home/ubuntu/finn/notebooks/end2end_example/cybersecurity/output_ipstitch_ooc_rtlsim/stitched_ip`\n", " " @@ -659,7 +659,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt index e03eff2c98..c2973f9432 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ ipython==8.12.2 numpy==1.24.1 onnx==1.13.0 onnxoptimizer -onnxruntime==1.15.0 +onnxruntime==1.16.1 pre-commit==3.3.2 protobuf==3.20.3 psutil==5.9.4 diff --git a/src/finn/analysis/fpgadataflow/dataflow_performance.py b/src/finn/analysis/fpgadataflow/dataflow_performance.py index 824690f5f6..a4bf40760e 100644 --- a/src/finn/analysis/fpgadataflow/dataflow_performance.py +++ b/src/finn/analysis/fpgadataflow/dataflow_performance.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -28,7 +29,7 @@ from qonnx.custom_op.registry import getCustomOp -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def dataflow_performance(model): @@ -38,7 +39,7 @@ def dataflow_performance(model): for each node along the critical path. Preconditions: - - model consists of fpgadataflow nodes + - model consists of HLS/RTL nodes - model has cycle estimates annotated (see AnnotateCycles transformation) - nodes have unique names (see GiveUniqueNodeNames) @@ -52,7 +53,7 @@ def dataflow_performance(model): max_node_name = "" for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): inst = getCustomOp(node) node_cycles = int(inst.get_nodeattr("cycles_estimate")) if node_cycles > max_cycles: diff --git a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py index e1517ec636..50585720fe 100644 --- a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py +++ b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,7 +29,7 @@ import qonnx.custom_op.registry as registry -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def exp_cycles_per_layer(model): @@ -41,7 +42,7 @@ def exp_cycles_per_layer(model): cycle_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) cycle_dict[node.name] = int(inst.get_exp_cycles()) diff --git a/src/finn/analysis/fpgadataflow/floorplan_params.py b/src/finn/analysis/fpgadataflow/floorplan_params.py index d57b660bce..be03966fb9 100644 --- a/src/finn/analysis/fpgadataflow/floorplan_params.py +++ b/src/finn/analysis/fpgadataflow/floorplan_params.py @@ -1,4 +1,5 @@ # Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,7 +46,7 @@ def floorplan_params(model): } } for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): node_inst = getCustomOp(node) node_slr = node_inst.get_nodeattr("slr") node_pid = node_inst.get_nodeattr("partition_id") diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py index 4d921438f6..330494315a 100644 --- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py +++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py @@ -30,11 +30,12 @@ import warnings import xml.etree.ElementTree as ET -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node def hls_synth_res_estimation(model): - """Extracts the FPGA resource results from the Vivado HLS synthesis estimates. + """Extracts the FPGA resource results from the Vitis HLS synthesis estimates. + Note that this analysis pass only works on nodes that have an HLS backend. Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames transformation) prior to calling this analysis pass to ensure all nodes are visible in the results. 
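These analysis passes are plain functions applied through `ModelWrapper.analysis`; a hedged sketch of typical usage, assuming `model` contains specialized HLS/RTL nodes:

```python
# Hedged sketch: running FINN analysis passes on a dataflow model.
from qonnx.transformation.general import GiveUniqueNodeNames
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles

model = model.transform(GiveUniqueNodeNames())      # precondition: unique node names
cycle_dict = model.analysis(exp_cycles_per_layer)   # {node_name: expected cycles}
model = model.transform(AnnotateCycles())           # precondition for the next pass
perf = model.analysis(dataflow_performance)         # critical path / max cycles info
```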
@@ -43,7 +44,7 @@ def hls_synth_res_estimation(model): res_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node): # init values to zero res_dict[node.name] = dict() res_dict[node.name]["BRAM_18K"] = 0 diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py index 3304b88d60..7b65b60fa7 100644 --- a/src/finn/analysis/fpgadataflow/post_synth_res.py +++ b/src/finn/analysis/fpgadataflow/post_synth_res.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp -from finn.transformation.move_reshape import _is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def post_synth_res(model, override_synth_report_filename=None): @@ -102,7 +103,7 @@ def get_instance_stats(inst_name): sdp_model = ModelWrapper(getCustomOp(node).get_nodeattr("model")) sdp_res_dict = post_synth_res(sdp_model, synth_report_filename) res_dict.update(sdp_res_dict) - elif _is_fpgadataflow_node(node): + elif is_hls_node(node) or is_rtl_node(node): node_dict = get_instance_stats(node.name) if node_dict is not None: res_dict[node.name] = node_dict diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index be4cf417bc..a6be1f1f53 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -28,7 +28,7 @@ import qonnx.custom_op.registry as registry -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def res_estimation(model): @@ -41,7 +41,7 @@ def res_estimation(model): res_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) res_dict[node.name] = inst.node_res_estimation() @@ -59,10 +59,10 @@ def res_estimation_complete(model): res_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: - op_type = node.op_type + if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) - if op_type == "MatrixVectorActivation" or op_type == "VectorVectorActivation": + op_type = node.op_type + if op_type.startswith("MVAU") or op_type.startswith("VVAU"): orig_restype = inst.get_nodeattr("resType") res_dict[node.name] = [] inst.set_nodeattr("resType", "dsp") @@ -70,7 +70,7 @@ def res_estimation_complete(model): inst.set_nodeattr("resType", "lut") res_dict[node.name].append(inst.node_res_estimation()) inst.set_nodeattr("resType", orig_restype) - elif op_type == "ConvolutionInputGenerator": + elif op_type.startswith("ConvolutionInputGenerator"): orig_ramstyle = inst.get_nodeattr("ram_style") res_dict[node.name] = [] inst.set_nodeattr("ram_style", "block") diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index e4fed05731..e35c1cd346 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -1,4 +1,5 @@ # Copyright (c) 2020 Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -64,15 +65,6 @@ class DataflowOutputType(str, Enum): DEPLOYMENT_PACKAGE = "deployment_package" -class ComputeEngineMemMode(str, Enum): - """Memory mode for generated compute engines. See - https://finn.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode - for more information.""" - - CONST = "const" - DECOUPLED = "decoupled" - - class VitisOptStrategyCfg(str, Enum): """Vitis optimization strategy with serializable string enum values.""" @@ -115,14 +107,15 @@ class VerificationStepType(str, Enum): "step_qonnx_to_finn", "step_tidy_up", "step_streamline", - "step_convert_to_hls", + "step_convert_to_hw", "step_create_dataflow_partition", + "step_specialize_layers", "step_target_fps_parallelization", "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", - "step_hls_codegen", - "step_hls_ipgen", + "step_hw_codegen", + "step_hw_ipgen", "step_set_fifo_depths", "step_create_stitched_ip", "step_measure_rtlsim_performance", @@ -137,17 +130,18 @@ class VerificationStepType(str, Enum): "step_qonnx_to_finn", "step_tidy_up", "step_streamline", - "step_convert_to_hls", + "step_convert_to_hw", "step_create_dataflow_partition", + "step_specialize_layers", "step_target_fps_parallelization", "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", ] -#: List of steps to run for a dataflow build including HLS code generation, but +#: List of steps to run for a dataflow build including HW code generation, but #: without any synthesis. -hls_codegen_dataflow_steps = estimate_only_dataflow_steps + ["step_hls_codegen"] +hw_codegen_dataflow_steps = estimate_only_dataflow_steps + ["step_hw_codegen"] @dataclass_json @@ -170,6 +164,14 @@ class DataflowBuildConfig: #: DataflowOutputType for available options. generate_outputs: List[DataflowOutputType] + #: (Optional) Path to a configuration JSON file in which the user can specify + #: a preferred implementation style (HLS or RTL) for each node. + #: The SpecializeLayers transformation picks up these settings and, if possible, + #: fulfills the desired implementation style for each layer by converting the + #: node into its HLS or RTL variant. + #: Will be applied with :py:mod:`qonnx.transformation.general.ApplyConfig` + specialize_layers_config_file: Optional[str] = None + #: (Optional) Path to configuration JSON file. May include parallelization, #: FIFO sizes, RAM and implementation style attributes and so on. #: If the parallelization attributes (PE, SIMD) are part of the config, @@ -230,7 +232,7 @@ class DataflowBuildConfig: mvau_wwidth_max: Optional[int] = 36 #: (Optional) Whether thresholding layers (which implement quantized - #: activations in FINN) will be implemented as stand-alone HLS layers, + #: activations in FINN) will be implemented as stand-alone HW layers, #: instead of being part of MatrixVectorActivation layer. This gives larger #: flexibility, and makes it possible to have runtime-writable thresholds. standalone_thresholds: Optional[bool] = False @@ -277,17 +279,14 @@ class DataflowBuildConfig: #: Only relevant when `auto_fifo_depths = True` large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO - #: Target clock frequency (in nanoseconds) for Vivado HLS synthesis. + #: Target clock frequency (in nanoseconds) for Vitis HLS synthesis. #: e.g. `hls_clk_period_ns=5.0` will target a 200 MHz clock. 
#: If not specified it will default to synth_clk_period_ns hls_clk_period_ns: Optional[float] = None - #: Which memory mode will be used for compute layers - default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED - - #: Force inference of RTL ConvolutionInputGenerator over HLS implementation - #: If set to False, falls back to the default behavior of InferConvInpGen() - force_rtl_conv_inp_gen: Optional[bool] = False + #: Call CapConvolutionFIFODepths in InsertAndSetFIFODepths transform + #: to make convolution FIFOs smaller where appropriate + default_swg_exception: Optional[bool] = False #: Which Vitis platform will be used. #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO` @@ -347,8 +346,8 @@ class DataflowBuildConfig: #: Override the number of inputs for rtlsim performance measurement. rtlsim_batch_size: Optional[int] = 1 - #: If set to True, FIFOs and DWCs with impl_style=vivado will be kept during - #: rtlsim, otherwise they will be replaced by HLS implementations. + #: If set to True, FIFOs with impl_style=vivado will be kept during + #: rtlsim, otherwise they will be replaced by RTL implementations. rtlsim_use_vivado_comps: Optional[bool] = True def _resolve_hls_clk_period(self): diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 54ba7e4ea1..443d2df54c 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -1,4 +1,5 @@ # Copyright (c) 2020 Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -52,7 +53,7 @@ from qonnx.util.config import extract_model_config_to_json from shutil import copy -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -108,6 +109,7 @@ SplitLargeFIFOs, ) from finn.transformation.fpgadataflow.set_folding import SetFolding +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext from finn.transformation.fpgadataflow.vitis_build import VitisBuild from finn.transformation.move_reshape import RemoveCNVtoFCFlatten @@ -216,23 +218,15 @@ def verify_step( def prepare_for_stitched_ip_rtlsim(verify_model, cfg): if not cfg.rtlsim_use_vivado_comps: need_restitch = False - # switch impl_style=vivado components to rtl/hls + # switch impl_style=vivado components to rtl # StreamingFIFO must have impl_style=rtl - for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): + for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO_rtl"): inst = getCustomOp(fifo_layer) if inst.get_nodeattr("impl_style") != "rtl": inst.set_nodeattr("impl_style", "rtl") inst.set_nodeattr("code_gen_dir_ipgen", "") inst.set_nodeattr("ipgen_path", "") need_restitch = True - # StreamingDataWidthConverter must have impl_style=hls - for dwc_layer in verify_model.get_nodes_by_op_type("StreamingDataWidthConverter_Batch"): - inst = getCustomOp(dwc_layer) - if inst.get_nodeattr("impl_style") != "hls": - inst.set_nodeattr("impl_style", "hls") - inst.set_nodeattr("code_gen_dir_ipgen", "") - inst.set_nodeattr("ipgen_path", "") - need_restitch 
= True # if we've made alterations to the model, need to do some re-prep if need_restitch: print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") @@ -336,43 +330,42 @@ return model -def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig): - """Convert eligible nodes to `HLSCustomOp` subclasses that represent HLS - layers. Which nodes and particular configurations can be converted to HLS - is limited, see the source code of the `convert_to_hls` module for more.""" +def step_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): + """Convert eligible nodes to `HWCustomOp` subclasses that represent HW + layers. Which nodes and particular configurations can be converted to HW + is limited; see the source code of the `convert_to_hw` module for more. + In the end an empty JSON file is created which can be used to set user-specific + preferred implementation styles for each node.""" - mem_mode = cfg.default_mem_mode.value if cfg.standalone_thresholds: # doing this first causes all threshold layers to be standalone - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for bipolar MatMul layers - model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) # needed for non-bipolar MatMul layers - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) # TopK to LabelSelect - model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) # input quantization (if any) as standalone threshold - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for convolutions -- TODO always exec? need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0 if need_conv: - if cfg.force_rtl_conv_inp_gen: - model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) - else: - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferStreamingMaxPool()) model = model.transform(RemoveCNVtoFCFlatten()) # get rid of Transpose -> Transpose identity seq model = model.transform(absorb.AbsorbConsecutiveTransposes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferDataLayouts()) + return model def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig): - """Separate consecutive groups of HLSCustomOp nodes into StreamingDataflowPartition + """Separate consecutive groups of HWCustomOp nodes into StreamingDataflowPartition nodes, which point to a separate ONNX file. 
Dataflow accelerator synthesis - can only be performed on those HLSCustomOp sub-graphs.""" + can only be performed on those HWCustomOp sub-graphs.""" parent_model = model.transform( CreateDataflowPartition( @@ -387,6 +380,31 @@ if cfg.save_intermediate_models: parent_model.save(cfg.output_dir + "/intermediate_models/dataflow_parent.onnx") model = ModelWrapper(dataflow_model_filename) + + # create a configuration json file that can be used to set the specialize layer config + attrs = [ + "preferred_impl_style", + ] + extract_model_config_to_json( + model, cfg.output_dir + "/template_specialize_layers_config.json", attrs + ) + + return model + + +def step_specialize_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + """Convert HW nodes to either an HLS or RTL variant of the node. HW nodes + are converted either based on pre-determined rules (details can be found + in the `specialize_layers` source code) or based on a user-provided configuration + file which contains the desired settings. If the user preference cannot be fulfilled, + a warning will be printed and the implementation style will be set to a default.""" + + if cfg.specialize_layers_config_file is not None: + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(ApplyConfig(cfg.specialize_layers_config_file)) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) return model @@ -414,6 +432,8 @@ "resType", "mem_mode", "runtime_writeable_weights", + "depth_trigger_uram", + "depth_trigger_bram", ] extract_model_config_to_json(model, cfg.output_dir + "/auto_folding_config.json", hw_attrs) @@ -482,16 +502,17 @@ return model -def step_hls_codegen(model: ModelWrapper, cfg: DataflowBuildConfig): - "Generate Vivado HLS code to prepare HLSCustomOp nodes for IP generation." +def step_hw_codegen(model: ModelWrapper, cfg: DataflowBuildConfig): + """Generate Vitis HLS code to prepare HLSBackend nodes for IP generation + and fill RTL templates for RTLBackend nodes.""" model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) return model -def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): - """Run Vivado HLS synthesis on generated code for HLSCustomOp nodes, - in order to generate IP blocks.""" +def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): + """Run Vitis HLS synthesis on generated code for HLSBackend nodes, + in order to generate IP blocks. 
For RTL nodes this step does not do anything.""" model = model.transform(HLSSynthIP()) model = model.transform(ReplaceVerilogRelPaths()) @@ -519,6 +540,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.auto_fifo_depths: if cfg.auto_fifo_strategy == "characterize": model = model.transform(InsertDWC()) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform( PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) @@ -536,6 +558,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): create_shallow_fifos=True, ) ) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) elif cfg.auto_fifo_strategy == "largefifo_rtlsim": @@ -551,6 +574,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): InsertAndSetFIFODepths( cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period(), + swg_exception=cfg.default_swg_exception, vivado_ram_style=cfg.large_fifo_mem_style, force_python_sim=force_python_sim, ) @@ -566,6 +590,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # need to make sure all FIFOs are created so that their depth can be # set by ApplyConfig, so create_shallow_fifos=True model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) if cfg.folding_config_file is not None: @@ -584,6 +609,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): "runtime_writeable_weights", "inFIFODepths", "outFIFODepths", + "depth_trigger_uram", + "depth_trigger_bram", ] extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs) @@ -823,14 +850,15 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig): "step_qonnx_to_finn": step_qonnx_to_finn, "step_tidy_up": step_tidy_up, "step_streamline": step_streamline, - "step_convert_to_hls": step_convert_to_hls, + "step_convert_to_hw": step_convert_to_hw, + "step_specialize_layers": step_specialize_layers, "step_create_dataflow_partition": step_create_dataflow_partition, "step_target_fps_parallelization": step_target_fps_parallelization, "step_apply_folding_config": step_apply_folding_config, "step_minimize_bit_width": step_minimize_bit_width, "step_generate_estimate_reports": step_generate_estimate_reports, - "step_hls_codegen": step_hls_codegen, - "step_hls_ipgen": step_hls_ipgen, + "step_hw_codegen": step_hw_codegen, + "step_hw_ipgen": step_hw_ipgen, "step_set_fifo_depths": step_set_fifo_depths, "step_create_stitched_ip": step_create_stitched_ip, "step_measure_rtlsim_performance": step_measure_rtlsim_performance, diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index d6c0794b00..aed2ab7fe1 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020-2022, Xilinx, Inc. +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,76 +27,57 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch -from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch -from finn.custom_op.fpgadataflow.checksum import CheckSum +from finn.custom_op.fpgadataflow.addstreams import AddStreams +from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp from finn.custom_op.fpgadataflow.concat import StreamingConcat from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( ConvolutionInputGenerator, ) -from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import ( - ConvolutionInputGenerator1D, -) -from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import ( - ConvolutionInputGenerator_rtl, -) from finn.custom_op.fpgadataflow.downsampler import DownSampler -from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch -from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise -from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch +from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams +from finn.custom_op.fpgadataflow.fmpadding import FMPadding from finn.custom_op.fpgadataflow.fmpadding_pixel import FMPadding_Pixel -from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl -from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch -from finn.custom_op.fpgadataflow.iodma import IODMA -from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch +from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool +from finn.custom_op.fpgadataflow.labelselect import LabelSelect from finn.custom_op.fpgadataflow.lookup import Lookup -from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation -from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch +from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU +from finn.custom_op.fpgadataflow.pool import Pool from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( StreamingDataflowPartition, ) -from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import ( - StreamingDataWidthConverter_Batch, -) -from finn.custom_op.fpgadataflow.streamingdatawidthconverter_rtl import ( - StreamingDataWidthConverter_rtl, +from finn.custom_op.fpgadataflow.streamingdatawidthconverter import ( + StreamingDataWidthConverter, ) +from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO -from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch -from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch -from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker -from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch -from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation +from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool +from finn.custom_op.fpgadataflow.thresholding import Thresholding +from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour +from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU custom_op = dict() # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure -custom_op["DownSampler"] = DownSampler -custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch -custom_op["MatrixVectorActivation"] = MatrixVectorActivation 
-custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator -custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D -custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl -custom_op["TLastMarker"] = TLastMarker -custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch -custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl +custom_op["MVAU"] = MVAU custom_op["StreamingFIFO"] = StreamingFIFO -custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch -custom_op["Pool_Batch"] = Pool_Batch -custom_op["FMPadding_Batch"] = FMPadding_Batch -custom_op["FMPadding_Pixel"] = FMPadding_Pixel -custom_op["Thresholding_Batch"] = Thresholding_Batch -custom_op["AddStreams_Batch"] = AddStreams_Batch -custom_op["LabelSelect_Batch"] = LabelSelect_Batch -custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch -custom_op["VectorVectorActivation"] = VectorVectorActivation -custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch -custom_op["IODMA"] = IODMA +custom_op["Thresholding"] = Thresholding +custom_op["VVAU"] = VVAU custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition -custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch + +custom_op["AddStreams"] = AddStreams +custom_op["ChannelwiseOp"] = ChannelwiseOp +custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator +custom_op["DownSampler"] = DownSampler +custom_op["DuplicateStreams"] = DuplicateStreams +custom_op["FMPadding"] = FMPadding +custom_op["FMPadding_Pixel"] = FMPadding_Pixel +custom_op["GlobalAccPool"] = GlobalAccPool +custom_op["LabelSelect"] = LabelSelect custom_op["Lookup"] = Lookup +custom_op["Pool"] = Pool custom_op["StreamingConcat"] = StreamingConcat -custom_op["CheckSum"] = CheckSum +custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter custom_op["StreamingEltwise"] = StreamingEltwise -custom_op["FMPadding_rtl"] = FMPadding_rtl +custom_op["StreamingMaxPool"] = StreamingMaxPool +custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour diff --git a/src/finn/custom_op/fpgadataflow/addstreams.py b/src/finn/custom_op/fpgadataflow/addstreams.py new file mode 100644 index 0000000000..ac61786ac1 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/addstreams.py @@ -0,0 +1,171 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class AddStreams(HWCustomOp): + """Abstraction layer for HW implementation of AddStreams.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) + return my_attrs + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich]) + return ishape + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + assert ich % pe == 0, "PE must divide NumChannels" + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich // pe, pe]) + return ishape + + def get_normal_output_shape(self, ind=0): + return self.get_normal_input_shape() + + def get_folded_output_shape(self, ind=0): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input1 shape." + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert ishape == exp_ishape, "Unexpected input2 shape." 
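+        # For illustration only (hypothetical attribute values, not from this
+        # PR): with NumChannels=64, PE=16 and numInputVectors=[1, 4, 4], the
+        # shape helpers above give
+        #   get_normal_input_shape() -> (1, 4, 4, 64)
+        #   get_folded_input_shape() -> (1, 4, 4, 4, 16)
+        # since folding splits the channel axis into (NumChannels // PE, PE)
+        # so that PE elements stream in parallel per cycle.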
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # enforce output data type (calculated based on idt) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + # we need to set output datatype to the next larger int or uint + # enhancement: consider specifying w/ explicit outputDataType attribute + # to allow overflow and use the same idt if user wants + idt = DataType[self.get_nodeattr("inputDataType")] + if idt.signed(): + return DataType.get_smallest_possible(2 * idt.min()) + else: + return DataType.get_smallest_possible(2 * idt.max()) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + # simulate behavior using Python + node = self.onnx_node + inp0_values = context[node.input[0]] + inp1_values = context[node.input[1]] + oshape = context[node.output[0]].shape + ishape0 = inp0_values.shape + ishape1 = inp1_values.shape + assert ishape0 == ishape1, "Shapes of inputs should be the same for Addstreams" + result = inp0_values + inp1_values + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + sname = self.hls_sname() + swidth = self.get_instream_width_padded() + intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] + return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + "in1": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op.py b/src/finn/custom_op/fpgadataflow/channelwise_op.py new file mode 100644 index 0000000000..9bf4ebdf62 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/channelwise_op.py @@ -0,0 +1,234 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import onnxruntime as rt +import warnings +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.util.basic import qonnx_make_model + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# ONNX i/o tensor shape assumptions for channelwise ops: +# input 0 is the input tensor, shape (..., NumChannels) +# input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel) +# output 0 is the output tensor, shape (..., NumChannels) - same as input +# the ... here can be any shape (representing groups of vectors) + + +def get_smallest_possible(vals): + """Returns smallest (fewest bits) possible DataType that can represent + value. Prefers unsigned integers where possible.""" + vals = np.array(vals, dtype=np.float64) + for v in vals: + assert int(v) == v, "Error float value" + + for k in DataType.get_accumulator_dt_cands(): + dt = DataType[k] + + if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: + # not currently supported + continue + + if (dt.min() <= vals).all() and (vals <= dt.max()).all(): + return dt + + warnings.warn( + """InferChannelwiseLinearLayer: Output values may not be + representable with supported data types. + Setting maximum width data type available. 
+    This will lead to errors if there are no constraints on the input
+    """
+    )
+
+    if (0 <= vals).all():
+        return DataType["UINT64"]
+    else:
+        return DataType["INT64"]
+
+
+class ChannelwiseOp(HWCustomOp):
+    """Abstraction layer for HW implementation of ChannelwiseOp."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # channelwise "map" function to apply:
+            # one of cmp_le, cmp_ge, add, mul
+            "Func": ("s", False, "cmp_le", {"cmp_le", "cmp_ge", "add", "mul"}),
+            "PE": ("i", True, 0),
+            "NumChannels": ("i", True, 0),
+            # string defining memory resource type for parameters
+            "ram_style": ("s", False, "distributed", {"distributed", "block"}),
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "paramDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def calc_tmem(self):
+        """Calculates and returns TMEM, the depth of the memory used
+        to store the channelwise op parameters."""
+        chn = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        return chn // pe
+
+    def make_shape_compatible_op(self, model):
+        oshape = self.get_normal_output_shape()
+        # implement tensor with correct shape
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # check input datatype against property
+        idt = model.get_tensor_datatype(node.input[0])
+
+        exp_idt_name = self.get_nodeattr("inputDataType")
+        if exp_idt_name != idt.name:
+            func = self.get_nodeattr("Func")
+            assert func in ["add", "mul"], "Bad input DataType for ChannelwiseOp layer"
+
+            self.set_nodeattr("inputDataType", idt.name)
+            # update the output datatype in the ['add','mul'] cases
+
+            # get parameter ranges
+            param = model.get_initializer(node.input[1])
+            param_min = min(param.flatten())
+            param_max = max(param.flatten())
+
+            # set function and determine output data type
+            if func == "add":
+                out_min = idt.min() + param_min
+                out_max = idt.max() + param_max
+                odt = get_smallest_possible([out_min, out_max])
+            elif func == "mul":
+                possible_limits = []
+                possible_limits += [idt.min() * param_min]
+                possible_limits += [idt.min() * param_max]
+                possible_limits += [idt.max() * param_min]
+                possible_limits += [idt.max() * param_max]
+                odt = get_smallest_possible(possible_limits)
+
+            self.set_nodeattr("outputDataType", odt.name)
+
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self, ind=0):
+        i_bits = self.get_input_datatype().bitwidth()
+        return i_bits * self.get_nodeattr("PE")
+
+    def get_outstream_width(self, ind=0):
+        o_bits = self.get_output_datatype().bitwidth()
+        return o_bits * self.get_nodeattr("PE")
+
+    def get_folded_input_shape(self, ind=0):
+        ich = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        fold = ich // 
pe + vecs = list(self.get_nodeattr("numInputVectors")) + folded_input_shape = tuple(vecs + [fold, pe]) + return folded_input_shape + + def get_folded_output_shape(self, ind=0): + # same shape as input + return self.get_folded_input_shape() + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [ich]) + return normal_input_shape + + def get_normal_output_shape(self, ind=0): + # same shape as input + return self.get_normal_input_shape() + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + # create a standard onnx node to help calculate the result + # depending on Func node attribute either a Mul or an Add node + node = self.onnx_node + func = self.get_nodeattr("Func") + inp_values = context[node.input[0]] + param_values = context[node.input[1]] + oshape = context[node.output[0]].shape + ishape = inp_values.shape + pshape = param_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + param = helper.make_tensor_value_info(node.input[1], TensorProto.FLOAT, pshape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + node_func = helper.make_node( + func.capitalize(), + inputs=node.input, + outputs=[node.output[0]], + ) + graph_func = helper.make_graph( + nodes=[node_func], + name="single-add-exec", + inputs=[inp, param], + outputs=[outp], + ) + + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_func = qonnx_make_model(graph_func, **onnx_kwargs) + idict = {node.input[0]: inp_values, node.input[1]: param_values} + sess = rt.InferenceSession(model_func.SerializeToString()) + result = sess.run(None, idict) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py index 8c24dadbeb..210b6b7fdd 100644 --- a/src/finn/custom_op/fpgadataflow/concat.py +++ b/src/finn/custom_op/fpgadataflow/concat.py @@ -1,4 +1,5 @@ # Copyright (c) 2021, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,16 +28,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np -import os from qonnx.core.datatype import DataType from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class StreamingConcat(HLSCustomOp): - """Streaming concatenation node with dynamically generated HLS. +class StreamingConcat(HWCustomOp): + """Abstraction layer for HW implementation of Concat. 
Only supports concatenating along the last axis.""" def __init__(self, onnx_node, **kwargs): @@ -127,251 +126,13 @@ def get_number_output_values(self): def get_exp_cycles(self): return np.prod(self.get_folded_output_shape()[:-1]) - def generate_params(self, model, path): - elems_per_stream = self.get_nodeattr("ElemsPerStream") - inp_streams = [] - commands = [] - idt = self.get_input_datatype() - total_elems = self.get_total_elems() - total_bw = idt.bitwidth() * total_elems - for i, elems in enumerate(elems_per_stream): - bw = idt.bitwidth() * elems - inp_stream = "hls::stream > &in%d" % (bw, i) - inp_streams.append(inp_stream) - cmd = "in%d.read()" % i - commands.append(cmd) - out_stream = "hls::stream > &out" % (total_bw) - inp_streams.append(out_stream) - - impl_hls_code = [] - impl_hls_code.append("void StreamingConcat(") - impl_hls_code.append(",".join(inp_streams)) - impl_hls_code.append(", unsigned int numReps) {") - impl_hls_code.append("for(unsigned int i = 0; i < numReps; i++) {") - impl_hls_code.append("#pragma HLS PIPELINE II=1") - impl_hls_code.append("ap_uint<%d> out_elem;" % total_bw) - # FIXME: the order of streams for concatenation works out differently - # for cppsim vs rtlsim, addressed via reversing the order of commands - # for now - impl_hls_code.append("#ifdef __SYNTHESIS__") - impl_hls_code.append("out_elem = (" + ",".join(commands[::-1]) + ");") - impl_hls_code.append("#else") - impl_hls_code.append("out_elem = (" + ",".join(commands) + ");") - impl_hls_code.append("#endif") - impl_hls_code.append("out.write(out_elem);") - impl_hls_code.append("}") - impl_hls_code.append("}") - impl_hls_code = "\n".join(impl_hls_code) - - impl_filename = "{}/concat_impl.hpp".format(path) - f_impl = open(impl_filename, "w") - f_impl.write(impl_hls_code) - f_impl.close() - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") node = self.onnx_node - n_inps = len(self.onnx_node.input) - ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)] - folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)] - exp_oshape = self.get_normal_output_shape() - folded_oshape = self.get_folded_output_shape() - export_idt = self.get_input_datatype() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - for i in range(n_inps): - inp = context[node.input[i]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == ishapes[i], "Input shape mismatch for " + node.input[i] - # reshape input into folded form - inp = inp.reshape(folded_ishapes[i]) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_%d.npy" % i), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) - elif mode == "rtlsim": - sim = self.get_rtlsim() - io_dict = {"inputs": {}, "outputs": {"out": []}} - for i in range(n_inps): - nbits = self.get_instream_width(i) - rtlsim_inp = npy_to_rtlsim_input( - "%s/input_%d.npy" % (code_gen_dir, i), - export_idt, - nbits, - reverse_inner=True, - ) - io_dict["inputs"]["in%d" % i] = rtlsim_inp - super().reset_rtlsim(sim) - super().toggle_clk(sim) - - self.rtlsim_multi_io(sim, io_dict) - rtlsim_output = io_dict["outputs"]["out"] - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, - out_npy_path, - odt, - out_shape, - packed_bits, - target_bits, - reverse_inner=True, - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape.""" - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "concat_impl.hpp"'] - - def defines(self, var): - num_reps = self.get_nodeattr("numInputVectors") - num_reps = np.prod(num_reps) - self.code_gen_dict["$DEFINES$"] = ["#define NumReps %d" % num_reps] - - def read_npy_data(self): - n_inputs = self.get_n_inputs() - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - npy_type = "float" - self.code_gen_dict["$READNPYDATA$"] = [] - idt = self.get_input_datatype() - idt_bw = idt.bitwidth() - elem_hls_type = idt.get_hls_datatype_str() - elem_bits = idt_bw - for i in range(n_inputs): - packed_bits = self.get_instream_width(i) - packed_hls_type = "ap_uint<%d>" % packed_bits - npy_in = "%s/input_%d.npy" % (code_gen_dir, i) - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in%d_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - i, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - n_inputs = self.get_n_inputs() - for i in range(n_inputs): - packed_bits = self.get_instream_width(i) - packed_hls_type = "ap_uint<%d>" % packed_bits - stream_name = "in%d_%s" % (i, self.hls_sname()) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream<%s> %s ("%s");' % (packed_hls_type, stream_name, stream_name) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [] - n_inputs = self.get_n_inputs() - in_streams = [] - for i in range(n_inputs): - in_streams.append("in%d_%s" % (i, self.hls_sname())) - in_stream_names = ",".join(in_streams) - comp_call = "StreamingConcat(%s, out_%s, NumReps);" % ( - in_stream_names, - self.hls_sname(), - ) - self.code_gen_dict["$DOCOMPUTE$"] = [comp_call] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - n_inputs = self.get_n_inputs() - in_streams = [] - for i in range(n_inputs): - iwidth = self.get_instream_width(i) - in_streams.append("hls::stream> &in%d_%s" % (iwidth, i, self.hls_sname())) - in_streams = ",".join(in_streams) - total_width = self.get_input_datatype().bitwidth() * self.get_total_elems() - out_stream = "hls::stream> &out_%s" % ( - total_width, - self.hls_sname(), - ) - blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream) - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls] - - def pragmas(self): - n_inputs = self.get_n_inputs() - pragmas 
= [] - for i in range(n_inputs): - pragmas.append("#pragma HLS INTERFACE axis port=in%d_%s" % (i, self.hls_sname())) - self.code_gen_dict["$PRAGMAS$"] = pragmas - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + inp_values = [] + for inp in node.input: + inp_values.append(context[inp]) + result = np.concatenate(inp_values, axis=-1) + context[node.output[0]] = result def get_instream_width_padded(self, ind=0): in_width = self.get_instream_width(ind) diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 33c542d79d..96f49069c7 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,33 +26,24 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import math import numpy as np -import os +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator: # input 0 is the input tensor, shape NHWC = (1, IFMDim, IFMDim, IFMChannels) # output 0 is the output tensor, shape NHWC: # = (1, OFMDim, OFMDim, (ConvKernelDim^2)*IFMChannels) -# note: the actual data layout produced by the hlslib kernels is different -# for depthwise and non-depthwise ops. -# * non-depthwise SWG: (1, OFMDim, OFMDim, K, K, IFMChannels/SIMD, SIMD) -# * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/SIMD, K, K, SIMD) -# see test_fpgadataflow_slidingwindow.py for an example of how to transform -# between the two layouts - -class ConvolutionInputGenerator(HLSCustomOp): - """Class that corresponds to one of the finn-hlslib ConvolutionInputGenerator - (sliding window) function variants. Depending on the combination of - attributes (e.g. depthwise or not, whether k % stride is 0) a different - variant will be picked for the actual HLS implementation.""" +class ConvolutionInputGenerator(HWCustomOp): + """Abstraction layer for HW implementation of ConvolutionInputGenerator""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -82,23 +73,16 @@ def get_nodeattr_types(self): "distributed", {"auto", "block", "distributed", "ultra"}, ), + "parallel_window": ("i", False, 0, {0, 1}), + # 1D (True) or 2D (False) spatial data + "is1D": ("i", False, 0), + # Enable reprogrammable implementation to change FM dimensions, + # stride, or dilation during runtime (requires parallel_window = 0) + "dynamic_mode": ("i", False, 0, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_nodeattr(self, name): - # overriding get_nodeattr to check for square kernel/img.. 
requirement - # since this can't be done with the attribute restriction in nodeattr_types - # TODO non-square can be enabled in theory but needs testing - ret = super().get_nodeattr(name) - props_to_check = ["ConvKernelDim", "IFMDim", "OFMDim", "Stride", "Dilation"] - if name in props_to_check: - is_square = ret[0] == ret[1] - assert is_square, "Only square %s supported" % name - if name == "Dilation": - assert ret[0] == ret[1] == 1, "Only dilation=1 supported" - return ret - def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -137,8 +121,12 @@ def get_folded_output_shape(self, ind=0): ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - wf = int((k_h * k_w * ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) + if self.use_parallel_window_output(): + wf = int((ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) + else: + wf = int((k_h * k_w * ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) return folded_oshape def make_shape_compatible_op(self, model): @@ -177,330 +165,93 @@ def get_instream_width(self, ind=0): return in_width def get_outstream_width(self, ind=0): - """Returns stream width, input and output stream width are equal for - the sliding window function, so the function to determine the input - stream width can be reused.""" - return self.get_instream_width() + if self.use_parallel_window_output(): + # feed all window pixels in parallel + k_h, k_w = self.get_nodeattr("ConvKernelDim") + return self.get_instream_width() * k_h * k_w + else: + # if parallel variant not in use: same width for output and input stream + return self.get_instream_width() def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() num_output_elems = np.prod(folded_oshape[:-1]) return num_output_elems - def get_exp_cycles(self): - simd = self.get_nodeattr("SIMD") + def get_1d_conv_attrs_normalized(self): + # support both (1, D) and (D, 1) cases transparently: + # For the kernel, presenting the input data of size D as + # [H, W] = [Y, X] = [1, D] or [D, 1] + # effectively gives the same result. + # For consistency and ease of programming, this function + # returns the attributes of the layer as follows: + # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. + # The dummy ('1') dimension is the Y-dimension. 
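+        # Illustrative example (hypothetical values, not from this PR): a
+        # (D, 1) layer with
+        #   IFMDim=[8, 1], ConvKernelDim=[4, 1], Stride=[2, 1], Dilation=[1, 1]
+        # is normalized to the equivalent (1, D) form
+        #   ifm_dim=[1, 8], k=[1, 4], stride=[1, 2], dilation=[1, 1]
+        # by the reversals below.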
ifm_ch = self.get_nodeattr("IFMChannels") - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ofm_dim_h, ofm_dim_w = self.get_nodeattr("OFMDim") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - - # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h - cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv - cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd) - max_cycles = max(cycles_write_block, cycles_read_block) - exp_cycles = ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + + # see defines() for an explanation + if ifm_dim[1] == 1: + ifm_dim = ifm_dim[::-1] + ofm_dim = ofm_dim[::-1] + k = k[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) - return int(exp_cycles) + def get_exp_cycles(self): + return 0 def bram_estimation(self): - # NOTE: only tested with a square convolution - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - ram_style = self.get_nodeattr("ram_style") - if ram_style == "block" or ram_style == "auto": - ram_depth = ifm_dim * ifm_ch / simd - if ram_depth <= 512: - ram_width = 36 - elif ram_depth <= 1024: - ram_width = 18 - elif ram_depth <= 2048: - ram_width = 9 - elif ram_depth <= 4096: - ram_width = 4 - elif ram_depth <= 8192: - ram_width = 2 - else: - ram_width = 1 - return int( - (k + stride) - * ( - math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width) - * math.ceil(ifm_dim * ifm_ch / simd / ram_depth) - ) - ) - else: - return 0 + return 0 def lut_estimation(self): - # NOTE: only tested with a square convolution - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - ram_style = self.get_nodeattr("ram_style") - if ram_style == "distributed": - ram_luts = int( - (k + stride) - * ( - simd - * self.get_input_datatype().bitwidth() - * math.ceil(ifm_dim * ifm_ch / simd / 64) - ) - ) - else: - ram_luts = 0 - return 300 + ram_luts + return 0 def uram_estimation(self): - # NOTE: only tested with a square convolution - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - return int( - (k + stride) - * ( - math.ceil(simd * self.get_input_datatype().bitwidth() / 64) - * math.ceil(ifm_dim * ifm_ch / simd / 4096) - ) - ) - else: - return 0 + return 0 def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # using Im2Col node to calculate output node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = 
self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ifm_dim_h, ifm_dim_w, ifm_ch).""" - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim \ - did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output - shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"'] - - def defines(self, var): - numReps = 1 - ifm_dim = self.get_nodeattr("IFMDim")[0] + ifm_dim = self.get_nodeattr("IFMDim") + k = self.get_nodeattr("ConvKernelDim") + s = self.get_nodeattr("Stride") + d = self.get_nodeattr("Dilation") ifm_ch = self.get_nodeattr("IFMChannels") - ofm_dim = self.get_nodeattr("OFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - simd = self.get_nodeattr("SIMD") - ifm_precision = self.get_input_datatype().bitwidth() - - self.code_gen_dict["$DEFINES$"] = [ - """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n - #define Input_precision1 {}\n #define IFMDim1 {}\n - #define OFMDim1 {}\n #define SIMD1 {}\n - #define Stride1 {}\n #define numReps {}""".format( - k, ifm_ch, ifm_precision, ifm_dim, ofm_dim, simd, stride, numReps - ) - ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + ishape = inp_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + im2col_node = helper.make_node( + "Im2Col", + [node.input[0]], + [node.output[0]], + domain="qonnx.custom_op.general", + stride=[s[0], s[1]], + kernel_size=[k[0], k[1]], + dilations=[d[0], d[1]], + input_shape="(1,{},{},{})".format(ifm_dim[0], ifm_dim[1], ifm_ch), ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) + graph_im2col = helper.make_graph( + nodes=[im2col_node], + name="single-im2col-exec", + inputs=[inp], + outputs=[outp], ) - def docompute(self): - node = self.onnx_node - ram_style = self.get_nodeattr("ram_style") - map_to_hls_ram_style = { - "auto": "ap_resource_dflt()", - "block": "ap_resource_bram()", - "distributed": "ap_resource_lutram()", - "ultra": "ap_resource_uram()", - } - hls_ram_style = map_to_hls_ram_style[ram_style] - hls_call = node.op_type - - # check which ConvolutionInputGenerator is needed - k = 
self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - - if k % stride != 0: - hls_call += "_kernel_stride" - - if self.get_nodeattr("depthwise") == 1: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}_dws (in0_{}, out_{}, numReps, {});""".format( - hls_call, self.hls_sname(), self.hls_sname(), hls_ram_style - ) - ] - else: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} (in0_{}, out_{}, numReps, {});""".format( - hls_call, self.hls_sname(), self.hls_sname(), hls_ram_style - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{})""".format( - self.onnx_node.name, self.hls_sname(), self.hls_sname() - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_im2col = ModelWrapper(qonnx_make_model(graph_im2col, **onnx_kwargs)) + model_im2col.set_tensor_datatype(node.input[0], self.get_input_datatype()) + # use execution function from Im2Col node + # this automatically updates the execution context + inst = getCustomOp(im2col_node) + inst.execute_node(context, model_im2col.graph) diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py index e2cea6da6b..4f919d1b50 100644 --- a/src/finn/custom_op/fpgadataflow/downsampler.py +++ b/src/finn/custom_op/fpgadataflow/downsampler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,16 +27,18 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np -import os import warnings +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class DownSampler(HLSCustomOp): - """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function. 
+class DownSampler(HWCustomOp):
+    """Abstraction layer for HW implementation of DownSampling.
     Basically performs a down sampling of the image removing rows and columns."""
 
     def __init__(self, onnx_node, **kwargs):
@@ -174,197 +176,54 @@ def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
-    def global_includes(self):
-        self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"']
-
-    def defines(self, var):
-        self.code_gen_dict["$DEFINES$"] = []
-
-        ifm_ch = self.get_nodeattr("NumChannels")
-        self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)]
-
-        ibits = self.get_input_datatype().bitwidth()
-        self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)]
-
-        idim = self.get_nodeattr("ImgDim")
-        self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)]
-
-        simd = self.get_nodeattr("SIMD")
-        self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(simd)]
-
-        stride = self.get_nodeattr("Stride")
-        self.code_gen_dict["$DEFINES$"] += ["#define Stride {}".format(stride)]
-
-        batch_size = self.get_nodeattr("numInputVectors")
-        self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)]
-
-    def read_npy_data(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_input_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_instream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_in = "%s/input_0.npy" % code_gen_dir
-        self.code_gen_dict["$READNPYDATA$"] = []
-        self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                npy_in,
-                self.hls_sname(),
-            )
-        )
-
-    def strm_decl(self):
-        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
-                self.get_instream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
-                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-
-    def docompute(self):
-        dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D"
-        sname = self.hls_sname()
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            f"""ConvolutionInputGenerator_{dim_var}_kernel1<IFMChannels, Input_precision,
-            IFMDim, SIMD, Stride> (in0_{sname}, out_{sname}, numReps);"""
-        ]
-
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_output_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_outstream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_out = "%s/output.npy" % code_gen_dir
-        oshape = self.get_folded_output_shape()
-        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                self.hls_sname(),
-                oshape_cpp_str,
-                npy_out,
-            )
-        ]
-
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
-    def blackboxfunction(self):
-        packed_bits = self.get_instream_width()
-        packed_hls_type = 
"ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - packed_hls_type, - self.hls_sname(), - packed_hls_type, - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # using Im2Col node to calculate output node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + ifm_dim = self.get_nodeattr("ImgDim") + stride = self.get_nodeattr("Stride") + ifm_ch = self.get_nodeattr("NumChannels") + # check if 1D or 2D case + if self.get_nodeattr("is1D"): + if self.get_nodeattr("is1D_unitx"): + ifm_dim_w = 1 + sw = 1 + ifm_dim_h = ifm_dim + sh = stride + else: + ifm_dim_h = 1 + sh = 1 + ifm_dim_w = ifm_dim + sw = stride else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + ifm_dim_h = ifm_dim_w = ifm_dim + sh = sw = stride + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + ishape = inp_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + im2col_node = helper.make_node( + "Im2Col", + [node.input[0]], + [node.output[0]], + domain="qonnx.custom_op.general", + stride=[sh, sw], + kernel_size=[1, 1], + input_shape="(1,{},{},{})".format(ifm_dim_h, ifm_dim_w, ifm_ch), + ) + graph_im2col = helper.make_graph( + nodes=[im2col_node], + name="single-im2col-exec", + inputs=[inp], + outputs=[outp], + ) - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = 
np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim, OutputDim, NumChannels).""" + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_im2col = ModelWrapper(qonnx_make_model(graph_im2col, **onnx_kwargs)) + model_im2col.set_tensor_datatype(node.input[0], self.get_input_datatype()) + # use execution function from Im2Col node + # this automatically updates the execution context + inst = getCustomOp(im2col_node) + inst.execute_node(context, model_im2col.graph) diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams.py b/src/finn/custom_op/fpgadataflow/duplicatestreams.py new file mode 100644 index 0000000000..8943ffc9e3 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams.py @@ -0,0 +1,177 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
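+
+# Functional summary (illustrative only, not part of the FINN API):
+# DuplicateStreams copies one input stream to N identical output streams,
+# roughly equivalent to
+#
+#   def duplicate(x, num_output_streams):
+#       return [x] * num_output_streams
+#
+# The executable reference is execute_node below.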
+ +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class DuplicateStreams(HWCustomOp): + """Abstraction layer for HW implementation of DuplicateStreams""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + "NumChannels": ("i", True, 0), + "PE": ("i", True, 0), + # how many duplicated output streams to create + "NumOutputStreams": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_num_output_streams(self): + return self.get_nodeattr("NumOutputStreams") + + def get_normal_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ch]) + return ishape + + def get_folded_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + assert ch % pe == 0, "PE must divide NumChannels" + folds = int(ch / pe) + folded_ishape = tuple(vecs + [folds, pe]) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + # since the output shape of both out streams are the same + # return independently from index + return self.get_normal_input_shape() + + def get_folded_output_shape(self, ind=0): + # since the output shape of both out streams are the same + # return independently from index + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." 
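+        # Unlike single-output ops, the constant shape op produced below must
+        # be rewired to every duplicated output (e.g. with NumOutputStreams=2,
+        # both outputs receive the same oshape); hence the ret.output[:]
+        # assignment that follows.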
+ num_out = self.get_num_output_streams() + assert len(self.onnx_node.output) == num_out, "Unexpected number of outputs" + + oshape = self.get_normal_output_shape() + ret = super().make_const_shape_op(oshape) + ret.output[:] = self.onnx_node.output + return ret + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + odt = self.get_output_datatype() + for my_out in self.onnx_node.output: + model.set_tensor_datatype(my_out, odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return self.get_num_output_streams() * np.prod(self.get_folded_output_shape()[1:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + # passing input to both outputs to make + # abstraction layer executable + node = self.onnx_node + inp = context[node.input[0]] + exp_shape = self.get_normal_input_shape() + + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + for outp in node.output: + context[outp] = output + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + n_outputs = self.get_num_output_streams() + sname = self.hls_sname() + intf_names["m_axis"] = [] + for i in range(n_outputs): + intf_names["m_axis"].append( + ("out%d_%s" % (i, sname), self.get_outstream_width_padded()) + ) + return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out0": [], "out1": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding.py new file mode 100644 index 0000000000..5767028ea7 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/fmpadding.py @@ -0,0 +1,172 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class FMPadding(HWCustomOp):
+    """Abstraction layer for HW implementation of FMPadding.
+    Pads input image by given amount."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # spatial size of input images
+            "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
+            # total padding (per dimension) to apply
+            "Padding": (
+                "ints",
+                True,
+                [1, 1, 1, 1],
+            ),  # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end]
+            # number of channels in input image
+            "NumChannels": ("i", True, 0),
+            # SIMD input parallelism
+            "SIMD": ("i", False, 1),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # shape describing input vecs per execution
+            "numInputVectors": ("i", False, 1),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_padded_odim(self):
+        """Return the padded spatial size of the output."""
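+        # e.g. ImgDim=[28, 28] with Padding=[1, 1, 1, 1] pads one pixel on
+        # each border: odim_h = 28 + (1 + 1) = 30, so the result is [30, 30]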
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        pad = self.get_nodeattr("Padding")
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        odim_h = idim_h + pad_h
+        odim_w = idim_w + pad_w
+        return [odim_h, odim_w]
+
+    def get_exp_cycles(self):
+        odim_h, odim_w = self.get_padded_odim()
+        channels = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
+        return int(exp_cycles)
+
+    def get_normal_input_shape(self, ind=0):
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        ishape = (1, idim_h, idim_w, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self, ind=0):
+        odim_h, odim_w = self.get_padded_odim()
+        num_ch = self.get_nodeattr("NumChannels")
+
+        oshape = (1, odim_h, odim_w, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self, ind=0):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
+
+    def get_folded_output_shape(self, ind=0):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for FMPadding."
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+            self.set_nodeattr("inputDataType", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        # the hlslib op always pads with zeros, so ensure that the DataType
+        # is able to represent zeros
+        assert ret.allowed(0), "FMPadding_Batch DataType must support zero"
+        return ret
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output.
(Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_instream_width(self, ind=0):
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return ibits * simd
+
+    def get_outstream_width(self, ind=0):
+        obits = self.get_output_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return obits * simd
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def execute_node(self, context, graph):
+        # simulate behavior with Python functionality
+        node = self.onnx_node
+        pad = self.get_nodeattr("Padding")
+        inp_values = context[node.input[0]]
+        oshape = context[node.output[0]].shape
+        result = np.pad(
+            inp_values, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant"
+        )
+        context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py b/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py
index bc686bc6d2..b1f9900070 100644
--- a/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, Advanced Micro Devices, Inc.
+# Copyright (c) 2024, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,13 @@
 
 import numpy as np
-import os
 import warnings
 from qonnx.core.datatype import DataType
 
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
 
 
-class FMPadding_Pixel(HLSCustomOp):
+class FMPadding_Pixel(HWCustomOp):
     def __init__(self, onnx_node, **kwargs):
         super().__init__(onnx_node, **kwargs)
 
@@ -153,183 +151,25 @@ def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
-    def global_includes(self):
-        self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
-
-    def defines(self, var):
-        odim_h, odim_w = self.get_padded_odim()
-        stride_h, stride_w = self.get_nodeattr("Stride")
-        self.code_gen_dict["$DEFINES$"] = [
-            """
-            #define OutputDim_x {}\n
-            #define OutputDim_y {}\n
-            #define Stride_x {}\n
-            #define Stride_y {}\n
-            #define NumChannels {}\n
-            #define SIMD {}\n
-            """.format(
-                odim_w,
-                odim_h,
-                stride_w,
-                stride_h,
-                self.get_nodeattr("NumChannels"),
-                self.get_nodeattr("SIMD"),
-            )
-        ]
-
-    def read_npy_data(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_input_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_instream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_in = "%s/input_0.npy" % code_gen_dir
-        self.code_gen_dict["$READNPYDATA$"] = []
-        self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
-            % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
-        )
-
-    def strm_decl(self):
-        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
-        )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
-        )
-
-    def docompute(self):
-        in_t = 
self.get_input_datatype().get_hls_datatype_str()
-        odim_h, odim_w = self.get_padded_odim()
-        stride_h, stride_w = self.get_nodeattr("Stride")
-        hls_call = "FMPadding_Pixel_Nonsquare"
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<OutputDim_x, OutputDim_y, Stride_x, Stride_y, NumChannels, SIMD, {}> (in0, out);""".format(
-                hls_call, in_t
-            )
-        ]
-
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_output_datatype()
-        if dtype == DataType["BIPOLAR"]:
-            # use binary for bipolar storage
-            dtype = DataType["BINARY"]
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_outstream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_out = "%s/output.npy" % code_gen_dir
-        oshape = self.get_folded_output_shape()
-        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                oshape_cpp_str,
-                npy_out,
-            )
-        ]
-
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
-    def blackboxfunction(self):
-        packed_bits = self.get_instream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
-            % (self.onnx_node.name, packed_hls_type, packed_hls_type)
-        ]
-
-    def pragmas(self):
-        self.code_gen_dict["$PRAGMAS$"] = [
-            "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
-        ]
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
-        )
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
-
     def execute_node(self, context, graph):
-        mode = self.get_nodeattr("exec_mode")
+        # simulate behavior with Python functionality
         node = self.onnx_node
-        exp_ishape = self.get_normal_input_shape()
-        exp_oshape = self.get_normal_output_shape()
-        folded_ishape = self.get_folded_input_shape()
-
-        if mode == "cppsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        elif mode == "rtlsim":
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) + s_h, s_w = self.get_nodeattr("Stride") + inp_values = context[node.input[0]] + ishape = inp_values.shape + result = np.zeros( + ( + ishape[0], + ishape[1] + (ishape[1] - 1) * (s_h - 1), + ishape[2] + (ishape[2] - 1) * (s_w - 1), + ishape[3], ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim_H, OutputDim_W, NumChannels).""" + ) + for b in range(ishape[0]): + for h in range(ishape[1]): + for w in range(ishape[2]): + oh = h * s_h + ow = w * s_w + result[b, oh, ow, :] = inp_values[b, h, w, :] + oshape = context[node.output[0]].shape + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool.py b/src/finn/custom_op/fpgadataflow/globalaccpool.py new file mode 100644 index 0000000000..4008cdc7c9 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/globalaccpool.py @@ -0,0 +1,160 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class GlobalAccPool(HWCustomOp): + """Abstraction layer for HW implementation of GlobalAccPool""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + "NumChannels": ("i", True, 0), + "PE": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ch]) + return ishape + + def get_folded_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + assert ch % pe == 0, "PE must divide NumChannels" + folds = int(ch / pe) + folded_ishape = tuple(vecs + [folds, pe]) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + if len(vecs) == 1: + oshape = tuple(vecs + [ch]) + elif len(vecs) == 3: + oshape = tuple([vecs[0]] + [1, 1, ch]) + return oshape + + def get_folded_output_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + unfolded_shape = list(self.get_normal_output_shape()) + assert ch % pe == 0, "PE must divide NumChannels" + folds = int(ch / pe) + oshape = tuple(unfolded_shape[:-1] + [folds, pe]) + return oshape + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." 
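+        # the accumulation collapses the spatial dims, so e.g. a (1, 4, 4, 64)
+        # input yields a (1, 1, 1, 64) output (see get_normal_output_shape)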
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + # determine data type from image size and input type + idt = DataType[self.get_nodeattr("inputDataType")] + vecs = list(self.get_nodeattr("numInputVectors")) + npixels = vecs[-1] * vecs[-2] + if idt.signed(): + extreme_value = npixels * idt.min() + else: + extreme_value = npixels * idt.max() + return DataType.get_smallest_possible(extreme_value) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[1:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * idim * idim + Channels/PE + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + folds = int(ch / pe) + return int(np.prod(self.get_folded_input_shape()[:-1]) + folds) + + def execute_node(self, context, graph): + # simulate behavior with Python functionality + node = self.onnx_node + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + result = np.apply_over_axes(np.sum, inp_values, [1, 2]) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py deleted file mode 100644 index 5ed440dace..0000000000 --- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py +++ /dev/null @@ -1,352 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - - -class GlobalAccPool_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib AccPool_Batch function.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = { - "NumChannels": ("i", True, 0), - "PE": ("i", True, 0), - # FINN DataTypes for input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_normal_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ch]) - return ishape - - def get_folded_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) - assert ch % pe == 0, "PE must divide NumChannels" - folds = int(ch / pe) - folded_ishape = tuple(vecs + [folds, pe]) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - if len(vecs) == 1: - oshape = tuple(vecs + [ch]) - elif len(vecs) == 3: - oshape = tuple([vecs[0]] + [1, 1, ch]) - return oshape - - def get_folded_output_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - unfolded_shape = list(self.get_normal_output_shape()) - assert ch % pe == 0, "PE must divide NumChannels" - folds = int(ch / pe) - oshape = tuple(unfolded_shape[:-1] + [folds, pe]) - return oshape - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape." 
- return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - odt = self.get_output_datatype() - model.set_tensor_datatype(self.onnx_node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("NumChannels") - self.get_nodeattr("PE") - self.get_nodeattr("inputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append("""The required GlobalAccPool_Batch attributes do not exist.""") - - # verify that input data is 2D - if len(self.get_nodeattr("numInputVectors")) != 3: - info_messages.append("""GlobalAccPool_Batch requires 2D data input.""") - raise Exception - - return info_messages - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - # determine data type from image size and input type - idt = DataType[self.get_nodeattr("inputDataType")] - vecs = list(self.get_nodeattr("numInputVectors")) - npixels = vecs[-1] * vecs[-2] - if idt.signed(): - extreme_value = npixels * idt.min() - else: - extreme_value = npixels * idt.max() - return DataType.get_smallest_possible(extreme_value) - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - obits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = pe * obits - return out_width - - def get_number_output_values(self): - return np.prod(self.get_folded_output_shape()[1:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * idim * idim + Channels/PE - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - folds = int(ch / pe) - return int(np.prod(self.get_folded_input_shape()[:-1]) + folds) - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
-                    mode
-                )
-            )
-
-        inp = context[node.input[0]]
-        assert str(inp.dtype) == "float32", "Input datatype is not float32"
-        assert inp.shape == exp_ishape, """Input shape doesn't match expected shape ."""
-        export_idt = self.get_input_datatype()
-        # reshape input into folded form
-        inp = inp.reshape(folded_ishape)
-        # make copy before saving array
-        reshaped_input = inp.copy()
-        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
-
-        if mode == "cppsim":
-            # execute the precompiled model
-            super().exec_precompiled_singlenode_model()
-            # load output npy file
-            super().npy_to_dynamic_output(context)
-            assert (
-                context[node.output[0]].shape == exp_oshape
-            ), "cppsim \
-            did not produce expected output shape"
-        elif mode == "rtlsim":
-            sim = self.get_rtlsim()
-            nbits = self.get_instream_width()
-            rtlsim_inp = npy_to_rtlsim_input(
-                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
-            )
-            super().reset_rtlsim(sim)
-            super().toggle_clk(sim)
-            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
-            odt = self.get_output_datatype()
-            target_bits = odt.bitwidth()
-            packed_bits = self.get_outstream_width()
-            out_npy_path = "{}/output.npy".format(code_gen_dir)
-            out_shape = self.get_folded_output_shape()
-            rtlsim_output_to_npy(
-                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
-            )
-            # load and reshape output
-            output = np.load(out_npy_path)
-            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
-            context[node.output[0]] = output
-        else:
-            raise Exception(
-                """Invalid value for attribute exec_mode! Is currently set to: {}
-            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
-                    mode
-                )
-            )
-
-        assert (
-            context[node.output[0]].shape == exp_oshape
-        ), """Output shape doesn't match expected shape."""
-
-    def global_includes(self):
-        self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
-
-    def defines(self, var):
-        self.code_gen_dict["$DEFINES$"] = []
-
-    def read_npy_data(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_input_datatype()
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_instream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_in = "%s/input_0.npy" % code_gen_dir
-        self.code_gen_dict["$READNPYDATA$"] = []
-        self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                npy_in,
-                self.hls_sname(),
-            )
-        )
-
-    def strm_decl(self):
-        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
-                self.get_instream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
-                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-
-    def docompute(self):
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """AccPool_Batch<{}, {}, {}, {}, {}> (in0_{}, out_{}, 1);""".format(
-                self.get_normal_input_shape()[1],
-                self.get_nodeattr("NumChannels"),
-                self.get_input_datatype().get_hls_datatype_str(),
-                self.get_nodeattr("PE"),
-                self.get_output_datatype().get_hls_datatype_str(),
-                self.hls_sname(),
-                self.hls_sname(),
-            )
-        ]
-
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = 
self.get_output_datatype()
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_outstream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_out = "%s/output.npy" % code_gen_dir
-        oshape = self.get_folded_output_shape()
-        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                self.hls_sname(),
-                oshape_cpp_str,
-                npy_out,
-            )
-        ]
-
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
-    def blackboxfunction(self):
-        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<{}>> &in0_{},
-               hls::stream<ap_uint<{}>> &out_{})""".format(
-                self.onnx_node.name,
-                self.get_instream_width(),
-                self.hls_sname(),
-                self.get_outstream_width(),
-                self.hls_sname(),
-            )
-        ]
-
-    def pragmas(self):
-        self.code_gen_dict["$PRAGMAS$"] = [
-            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
-        ]
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
-        )
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
new file mode 100644
index 0000000000..405c47a08d
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -0,0 +1,81 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
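+
+# This module collects every HLS op variant into a `custom_op` registry dict,
+# keyed by op_type (see the dict below the imports). A node whose domain
+# points at this module is then resolved to its implementation class by a
+# plain dict lookup, roughly:
+#
+#   op_cls = custom_op["AddStreams_hls"]  # op_type of the ONNX node
+#   inst = op_cls(onnx_node)              # onnx_node is the node being wrapped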
+ +from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls +from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls +from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls +from finn.custom_op.fpgadataflow.hls.concat_hls import StreamingConcat_hls +from finn.custom_op.fpgadataflow.hls.convolutioninputgenerator_hls import ( + ConvolutionInputGenerator_hls, +) +from finn.custom_op.fpgadataflow.hls.downsampler_hls import DownSampler_hls +from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls +from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls +from finn.custom_op.fpgadataflow.hls.fmpadding_pixel_hls import FMPadding_Pixel_hls +from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls +from finn.custom_op.fpgadataflow.hls.iodma_hls import IODMA_hls +from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls +from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls +from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MVAU_hls +from finn.custom_op.fpgadataflow.hls.pool_hls import Pool_hls +from finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls import ( + StreamingDataWidthConverter_hls, +) +from finn.custom_op.fpgadataflow.hls.streamingeltwise_hls import StreamingEltwise_hls +from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls +from finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls +from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls +from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls +from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VVAU_hls + +custom_op = dict() + +# make sure new HLSCustomOp subclasses are imported here so that they get +# registered and plug in correctly into the infrastructure +custom_op["AddStreams_hls"] = AddStreams_hls +custom_op["ChannelwiseOp_hls"] = ChannelwiseOp_hls +custom_op["CheckSum_hls"] = CheckSum_hls +custom_op["ConvolutionInputGenerator_hls"] = ConvolutionInputGenerator_hls +custom_op["DownSampler_hls"] = DownSampler_hls +custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls +custom_op["FMPadding_hls"] = FMPadding_hls +custom_op["FMPadding_Pixel_hls"] = FMPadding_Pixel_hls +custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls +custom_op["IODMA_hls"] = IODMA_hls +custom_op["LabelSelect_hls"] = LabelSelect_hls +custom_op["Lookup_hls"] = Lookup_hls +custom_op["Pool_hls"] = Pool_hls +custom_op["StreamingConcat_hls"] = StreamingConcat_hls +custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls +custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls +custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls +custom_op["Thresholding_hls"] = Thresholding_hls +custom_op["TLastMarker_hls"] = TLastMarker_hls +custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls +custom_op["MVAU_hls"] = MVAU_hls +custom_op["VVAU_hls"] = VVAU_hls diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py similarity index 63% rename from src/finn/custom_op/fpgadataflow/addstreams_batch.py rename to src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py index 51de1590ec..a3f0e043f8 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright 
(C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,81 +28,24 @@ import numpy as np import os -import warnings -from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.addstreams import AddStreams +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class AddStreams_Batch(HLSCustomOp): +class AddStreams_hls(AddStreams, HLSBackend): """Class that corresponds to finn-hlslib AddStreams_Batch function.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = super().get_nodeattr_types() - my_attrs.update( - { - "NumChannels": ("i", True, ""), - "PE": ("i", True, ""), - # FINN DataTypes for inputs; output datatype inferred from input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - "inFIFODepths": ("ints", False, [2, 2]), - } - ) + my_attrs = {} + my_attrs.update(AddStreams.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich]) - return ishape - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - assert ich % pe == 0, "PE must divide NumChannels" - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich // pe, pe]) - return ishape - - def get_normal_output_shape(self, ind=0): - return self.get_normal_input_shape() - - def get_folded_output_shape(self, ind=0): - return self.get_folded_input_shape() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input1 shape." - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) - assert ishape == exp_ishape, "Unexpected input2 shape." 
-        return super().make_const_shape_op(oshape)
-
-    def infer_node_datatype(self, model):
-        node = self.onnx_node
-        idt = model.get_tensor_datatype(node.input[0])
-        if idt != self.get_input_datatype():
-            warn_str = "inputDataType changing for %s: %s -> %s " % (
-                node.name,
-                str(self.get_input_datatype()),
-                str(idt),
-            )
-            warnings.warn(warn_str)
-            self.set_nodeattr("inputDataType", idt.name)
-        # enforce output data type (calculated based on idt)
-        odt = self.get_output_datatype()
-        model.set_tensor_datatype(self.onnx_node.output[0], odt)
-
     def verify_node(self):
         info_messages = []
         # verify that "backend" is set to "fpgadataflow"
@@ -125,42 +68,6 @@ def verify_node(self):
 
         return info_messages
 
-    def get_input_datatype(self, ind=0):
-        """Returns FINN DataType of input."""
-        return DataType[self.get_nodeattr("inputDataType")]
-
-    def get_output_datatype(self, ind=0):
-        """Returns FINN DataType of output."""
-        # we need to set output datatype to the next larger int or uint
-        # enhancement: consider specifying w/ explicit outputDataType attribute
-        # to allow overflow and use the same idt if user wants
-        idt = DataType[self.get_nodeattr("inputDataType")]
-        if idt.signed():
-            return DataType.get_smallest_possible(2 * idt.min())
-        else:
-            return DataType.get_smallest_possible(2 * idt.max())
-
-    def get_instream_width(self, ind=0):
-        """Returns input stream width."""
-        ibits = self.get_input_datatype().bitwidth()
-        pe = self.get_nodeattr("PE")
-        in_width = pe * ibits
-        return in_width
-
-    def get_outstream_width(self, ind=0):
-        """Returns output stream width."""
-        obits = self.get_output_datatype().bitwidth()
-        pe = self.get_nodeattr("PE")
-        out_width = pe * obits
-        return out_width
-
-    def get_number_output_values(self):
-        return np.prod(self.get_folded_output_shape()[:-1])
-
-    def get_exp_cycles(self):
-        # Channels/PE * batch size * fmdim * fmdim
-        return np.prod(self.get_folded_output_shape()[:-1])
-
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
@@ -304,10 +211,10 @@ def strm_decl(self):
         )
 
     def docompute(self):
-        node = self.onnx_node
+        hls_call = "AddStreams_Batch"
         self.code_gen_dict["$DOCOMPUTE$"] = [
             """{}<{}, {}, {}, {}, {}> (in0_{}, in1_{}, out_{}, 1);""".format(
-                node.op_type,
+                hls_call,
                 self.get_nodeattr("PE"),
                 self.get_input_datatype().get_hls_datatype_str(),
                 self.get_input_datatype().get_hls_datatype_str(),
@@ -319,34 +226,6 @@ def docompute(self):
             )
         ]
 
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_output_datatype()
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_outstream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_out = "%s/output.npy" % code_gen_dir
-        oshape = self.get_folded_output_shape()
-        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                self.hls_sname(),
-                oshape_cpp_str,
-                npy_out,
-            )
-        ]
-
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
     def blackboxfunction(self):
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
             """void {}(hls::stream<ap_uint<{}>> &in0_{}, hls::stream<ap_uint<{}>> &in1_{},
@@ -372,21 +251,3 @@ def pragmas(self):
             "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
         )
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
-
-    def 
get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - sname = self.hls_sname() - swidth = self.get_instream_width_padded() - intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] - return intf_names - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - "in1": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py similarity index 73% rename from src/finn/custom_op/fpgadataflow/channelwise_op_batch.py rename to src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py index 5e0063ac33..14efa113dd 100644 --- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,11 +28,11 @@ import numpy as np import os -import warnings from math import ceil from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, @@ -46,38 +46,7 @@ # the ... here can be any shape (representing groups of vectors) -def get_smallest_possible(vals): - """Returns smallest (fewest bits) possible DataType that can represent - value. Prefers unsigned integers where possible.""" - vals = np.array(vals, dtype=np.float64) - for v in vals: - assert int(v) == v, "Error float value" - - for k in DataType.get_accumulator_dt_cands(): - dt = DataType[k] - - if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: - # not currently supported - continue - - if (dt.min() <= vals).all() and (vals <= dt.max()).all(): - return dt - - warnings.warn( - """InferChannelwiseLinearLayer: Output values may not be - representable with supported data types. - Setting maximum width data type available. - This will lead to errors if there are no constrains on the input - """ - ) - - if (0 <= vals).all(): - return DataType["UINT64"] - else: - return DataType["INT64"] - - -class ChannelwiseOp_Batch(HLSCustomOp): +class ChannelwiseOp_hls(ChannelwiseOp, HLSBackend): """Class that corresponds to finn-hls Thresholding_Batch function. It can implement a variety of channel-wise parametrized operations, including Add, Mul and multi-thresholding. 
@@ -87,76 +56,11 @@ def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - # channelwise "map" function to apply: - # one of cmp_le, cmp_ge, add, mul - "Func": ("s", False, "cmp_le", {"cmp_le", "cmp_ge", "add", "mul"}), - "PE": ("i", True, 0), - "NumChannels": ("i", True, 0), - # string defining memory resource type for parameters - "ram_style": ("s", False, "distributed", {"distributed", "block"}), - # FINN DataTypes for inputs, weights, outputs - "inputDataType": ("s", True, ""), - "paramDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(ChannelwiseOp.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def calc_tmem(self): - """Calculates and returns TMEM, the depth of the memory used - to store the channelwise op parameters.""" - chn = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - return chn // pe - - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - # implement tensor with correct shape - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - # check input datatype against property - idt = model.get_tensor_datatype(node.input[0]) - - exp_idt_name = self.get_nodeattr("inputDataType") - if exp_idt_name != idt.name: - func = self.get_nodeattr("Func") - assert func in ["add", "mul"], "Bad input DataType for ChannelwiseOp layer" - - self.set_nodeattr("inputDataType", idt.name) - # update the func in ['add','mul'] cases - - # get parameter ranges - param = model.get_initializer(node.input[1]) - param_min = min(param.flatten()) - param_max = max(param.flatten()) - - # set function and determine output data type - if func == "add": - out_min = idt.min() + param_min - out_max = idt.max() + param_max - odt = get_smallest_possible([out_min, out_max]) - elif func == "mul": - possible_limits = [] - possible_limits += [idt.min() * param_min] - possible_limits += [idt.min() * param_max] - possible_limits += [idt.max() * param_min] - possible_limits += [idt.max() * param_max] - odt = get_smallest_possible(possible_limits) - - self.set_nodeattr("outputDataType", odt.name) - - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) - def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -213,52 +117,6 @@ def lut_estimation(self): # total cost return comparator_cost + lutram_cost - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - return i_bits * self.get_nodeattr("PE") - - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - return o_bits * self.get_nodeattr("PE") - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = 
self.get_nodeattr("PE") - fold = ich // pe - vecs = list(self.get_nodeattr("numInputVectors")) - folded_input_shape = tuple(vecs + [fold, pe]) - return folded_input_shape - - def get_folded_output_shape(self, ind=0): - # same shape as input - return self.get_folded_input_shape() - - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_input_shape = tuple(vecs + [ich]) - return normal_input_shape - - def get_normal_output_shape(self, ind=0): - # same shape as input - return self.get_normal_input_shape() - - def get_number_output_values(self): - nf = np.prod(self.get_folded_output_shape()[:-1]) - return nf - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - def get_template_param_values(self): """Returns the template parameter values according to input, output and weight data types.""" @@ -452,7 +310,6 @@ def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"'] - # TODO check and add whatever missing def defines(self, var): numInputVectors = list(self.get_nodeattr("numInputVectors")) numReps = numInputVectors[0] @@ -487,19 +344,6 @@ def read_npy_data(self): ) ) - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): tmpl_args = self.get_template_param_values() # TODO: why put some template parameters into defines and not others? @@ -551,9 +395,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, @@ -582,13 +423,6 @@ def pragmas(self): self.code_gen_dict["$PRAGMAS$"].append( ("#pragma HLS ARRAY_PARTITION variable=threshs.parameters " "complete dim=1") ) - # self.code_gen_dict["$PRAGMAS$"].append( - # ( - # "#pragma HLS ARRAY_PARTITION variable=threshs.parameters " - # "complete dim=3" - # ) - # ) - # set resource type ram_style = self.get_nodeattr("ram_style") pe = self.get_nodeattr("PE") diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py similarity index 97% rename from src/finn/custom_op/fpgadataflow/checksum.py rename to src/finn/custom_op/fpgadataflow/hls/checksum_hls.py index 6121c5d97a..8a72ca3c6c 100644 --- a/src/finn/custom_op/fpgadataflow/checksum.py +++ b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py @@ -1,4 +1,5 @@ # Copyright (c) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
 #
 # Redistribution and use in source and binary forms, with or without
@@ -31,11 +32,12 @@
 import warnings
 from qonnx.core.datatype import DataType
 
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
-class CheckSum(HLSCustomOp):
+class CheckSum_hls(HWCustomOp, HLSBackend):
     """Class that corresponds to custom_hls checksum function."""
 
     def __init__(self, onnx_node, **kwargs):
@@ -52,7 +54,8 @@ def get_nodeattr_types(self):
             # folded shape of input/output
             "folded_shape": ("ints", True, []),
         }
-        my_attrs.update(super().get_nodeattr_types())
+        my_attrs.update(HWCustomOp.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
         return my_attrs
 
     def make_shape_compatible_op(self, model):
@@ -302,9 +305,6 @@ def dataoutstrm(self):
             'cnpy::npy_save("%s/output_checksum.npy",&checksum[0],{1},"w");' % code_gen_dir,
         ]
 
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
     def blackboxfunction(self):
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
             """using T = ap_uint<WORD_SIZE>;\n void {}(hls::stream<T> &in0_{},
diff --git a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py
new file mode 100644
index 0000000000..008fa9cee8
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py
@@ -0,0 +1,267 @@
+# Copyright (c) 2021, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+
+from finn.custom_op.fpgadataflow.concat import StreamingConcat
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class StreamingConcat_hls(StreamingConcat, HLSBackend):
+    """Streaming concatenation node with dynamically generated HLS.
+    Only supports concatenating along the last axis."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(StreamingConcat.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def generate_params(self, model, path):
+        elems_per_stream = self.get_nodeattr("ElemsPerStream")
+        inp_streams = []
+        commands = []
+        idt = self.get_input_datatype()
+        total_elems = self.get_total_elems()
+        total_bw = idt.bitwidth() * total_elems
+        for i, elems in enumerate(elems_per_stream):
+            bw = idt.bitwidth() * elems
+            inp_stream = "hls::stream<ap_uint<%d> > &in%d" % (bw, i)
+            inp_streams.append(inp_stream)
+            cmd = "in%d.read()" % i
+            commands.append(cmd)
+        out_stream = "hls::stream<ap_uint<%d> > &out" % (total_bw)
+        inp_streams.append(out_stream)
+
+        impl_hls_code = []
+        impl_hls_code.append("void StreamingConcat(")
+        impl_hls_code.append(",".join(inp_streams))
+        impl_hls_code.append(", unsigned int numReps) {")
+        impl_hls_code.append("for(unsigned int i = 0; i < numReps; i++) {")
+        impl_hls_code.append("#pragma HLS PIPELINE II=1")
+        impl_hls_code.append("ap_uint<%d> out_elem;" % total_bw)
+        # FIXME: the order of streams for concatenation works out differently
+        # for cppsim vs rtlsim, addressed via reversing the order of commands
+        # for now
+        impl_hls_code.append("#ifdef __SYNTHESIS__")
+        impl_hls_code.append("out_elem = (" + ",".join(commands[::-1]) + ");")
+        impl_hls_code.append("#else")
+        impl_hls_code.append("out_elem = (" + ",".join(commands) + ");")
+        impl_hls_code.append("#endif")
+        impl_hls_code.append("out.write(out_elem);")
+        impl_hls_code.append("}")
+        impl_hls_code.append("}")
+        impl_hls_code = "\n".join(impl_hls_code)
+
+        impl_filename = "{}/concat_impl.hpp".format(path)
+        f_impl = open(impl_filename, "w")
+        f_impl.write(impl_hls_code)
+        f_impl.close()
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        n_inps = len(self.onnx_node.input)
+        ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)]
+        folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)]
+        exp_oshape = self.get_normal_output_shape()
+        folded_oshape = self.get_folded_output_shape()
+        export_idt = self.get_input_datatype()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! 
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        n_inps = len(self.onnx_node.input)
+        ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)]
+        folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)]
+        exp_oshape = self.get_normal_output_shape()
+        folded_oshape = self.get_folded_output_shape()
+        export_idt = self.get_input_datatype()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        for i in range(n_inps):
+            inp = context[node.input[i]]
+            assert str(inp.dtype) == "float32", "Input datatype is not float32"
+            assert inp.shape == ishapes[i], "Input shape mismatch for " + node.input[i]
+            # reshape input into folded form
+            inp = inp.reshape(folded_ishapes[i])
+            # make copy before saving array
+            reshaped_input = inp.copy()
+            np.save(os.path.join(code_gen_dir, "input_%d.npy" % i), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            io_dict = {"inputs": {}, "outputs": {"out": []}}
+            for i in range(n_inps):
+                nbits = self.get_instream_width(i)
+                rtlsim_inp = npy_to_rtlsim_input(
+                    "%s/input_%d.npy" % (code_gen_dir, i),
+                    export_idt,
+                    nbits,
+                    reverse_inner=True,
+                )
+                io_dict["inputs"]["in%d" % i] = rtlsim_inp
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+
+            self.rtlsim_multi_io(sim, io_dict)
+            rtlsim_output = io_dict["outputs"]["out"]
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output,
+                out_npy_path,
+                odt,
+                out_shape,
+                packed_bits,
+                target_bits,
+                reverse_inner=True,
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "concat_impl.hpp"']
+
+    def defines(self, var):
+        num_reps = self.get_nodeattr("numInputVectors")
+        num_reps = np.prod(num_reps)
+        self.code_gen_dict["$DEFINES$"] = ["#define NumReps %d" % num_reps]
+
+    def read_npy_data(self):
+        n_inputs = self.get_n_inputs()
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        npy_type = "float"
+        self.code_gen_dict["$READNPYDATA$"] = []
+        idt = self.get_input_datatype()
+        idt_bw = idt.bitwidth()
+        elem_hls_type = idt.get_hls_datatype_str()
+        elem_bits = idt_bw
+        for i in range(n_inputs):
+            packed_bits = self.get_instream_width(i)
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            npy_in = "%s/input_%d.npy" % (code_gen_dir, i)
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", in%d_%s);'
+                % (
+                    packed_hls_type,
+                    elem_hls_type,
+                    elem_bits,
+                    npy_type,
+                    npy_in,
+                    i,
+                    self.hls_sname(),
+                )
+            )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        n_inputs = self.get_n_inputs()
+        for i in range(n_inputs):
+            packed_bits = self.get_instream_width(i)
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            stream_name = "in%d_%s" % (i, self.hls_sname())
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<%s> %s ("%s");' % (packed_hls_type, stream_name, stream_name)
+            )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = []
+        n_inputs = self.get_n_inputs()
+        in_streams = []
+        for i in range(n_inputs):
+            in_streams.append("in%d_%s" % (i, self.hls_sname()))
+        in_stream_names = ",".join(in_streams)
+        comp_call = "StreamingConcat(%s, out_%s, NumReps);" % (
+            in_stream_names,
+            self.hls_sname(),
+        )
+        self.code_gen_dict["$DOCOMPUTE$"] = [comp_call]
+
+    def blackboxfunction(self):
+        n_inputs = self.get_n_inputs()
+        in_streams = []
+        for i in range(n_inputs):
+            iwidth = self.get_instream_width(i)
+            in_streams.append("hls::stream<ap_uint<%d>> &in%d_%s" % (iwidth, i, self.hls_sname()))
+        in_streams = ",".join(in_streams)
+        total_width = self.get_input_datatype().bitwidth() * self.get_total_elems()
+        out_stream = "hls::stream<ap_uint<%d>> &out_%s" % (
+            total_width,
+            self.hls_sname(),
+        )
+        blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream)
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls]
+
+    def pragmas(self):
+        n_inputs = self.get_n_inputs()
+        pragmas = []
+        for i in range(n_inputs):
+            pragmas.append("#pragma HLS INTERFACE axis port=in%d_%s" % (i, self.hls_sname()))
+        self.code_gen_dict["$PRAGMAS$"] = pragmas
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py
similarity index 57%
rename from src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
rename to src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py
index 046e8e096d..4a5c02ee06 100644
---
a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py +++ b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py @@ -1,4 +1,5 @@ # Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,15 +32,13 @@ import os import warnings from qonnx.core.datatype import DataType -from qonnx.custom_op.general.im2col import compute_conv_output_dim -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( + ConvolutionInputGenerator, +) +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -# This operation should only be used for 1D convolutions. Either the -# IFMDim_H or IFMDim_W should be '1', which represents the so-called -# dummy-dimension - # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator1D: # input 0 is the input tensor, shape NHWC = (1, IFMDim_H, IFMDim_W, IFMChannels) # output 0 is the output tensor, shape NHWC: @@ -53,7 +52,7 @@ # between the two layouts -class ConvolutionInputGenerator1D(HLSCustomOp): +class ConvolutionInputGenerator_hls(ConvolutionInputGenerator, HLSBackend): """Class that corresponds to one of the 1D finn-hlslib ConvolutionInputGenerator (sliding window) function variants. Depending on the combination of attributes (e.g. depthwise or not, whether dilation is 0) a different @@ -63,175 +62,49 @@ def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X] - "IFMChannels": ("i", True, 0), - "IFMDim": ("ints", True, []), # [H, W] = [Y, X] - "OFMDim": ("ints", True, []), # [H, W] = [Y, X] - "SIMD": ("i", True, 0), - "Stride": ("ints", True, []), # [H, W] = [Y, X] - "Dilation": ("ints", True, []), # [H, W] = [Y, X] - # FINN DataTypes for inputs, weights, outputs - "inputDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - "depthwise": ("i", False, 0, {0, 1}), - # FPGA resource type for ConvolutionInputGenerator input buffer - # auto -- let Vivado HLS decide - # block -- use BRAM - # distributed -- use LUTRAM - # ultra -- use URAM - "ram_style": ( - "s", - False, - "distributed", - {"auto", "block", "distributed", "ultra"}, - ), - "parallel_window": ("i", False, 0, {0, 1}), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(ConvolutionInputGenerator.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_normal_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - wf = int(ifm_ch / simd) - folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - pad = 0 - ofm_dim_h = 
compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
-        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
-        oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
-        return oshape
-
-    def get_folded_output_shape(self, ind=0):
-        k_h, k_w = self.get_nodeattr("ConvKernelDim")
-        ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
-        ifm_ch = self.get_nodeattr("IFMChannels")
-        stride_h, stride_w = self.get_nodeattr("Stride")
-        dilation_h, dilation_w = self.get_nodeattr("Dilation")
-        simd = self.get_nodeattr("SIMD")
-        pad = 0
-        ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
-        ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
-        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        if self.use_parallel_window_output():
-            wf = int((ifm_ch) // simd)
-            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
-        else:
-            wf = int((k_h * k_w * ifm_ch) // simd)
-            folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
-        return folded_oshape
-
-    def make_shape_compatible_op(self, model):
-        exp_ishape = self.get_normal_input_shape()
-        oshape = self.get_normal_output_shape()
-        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
-        assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen."
-        return super().make_const_shape_op(oshape)
-
-    def infer_node_datatype(self, model):
-        node = self.onnx_node
-        # data type stays the same
-        dtype = model.get_tensor_datatype(node.input[0])
-        model.set_tensor_datatype(node.output[0], dtype)
-
-    def verify_node(self):
-        pass
-
-    def get_input_datatype(self, ind=0):
-        """Returns FINN DataType of input."""
-        return DataType[self.get_nodeattr("inputDataType")]
-
-    def get_output_datatype(self, ind=0):
-        """Returns FINN DataType of output."""
-        return DataType[self.get_nodeattr("outputDataType")]
-
-    def get_instream_width(self, ind=0):
-        ibits = self.get_input_datatype().bitwidth()
-        simd = self.get_nodeattr("SIMD")
-        ifm_ch = self.get_nodeattr("IFMChannels")
-        assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        in_width = simd * ibits
-        return in_width
-
-    def get_outstream_width(self, ind=0):
-        if self.use_parallel_window_output():
-            # feed all window pixels in parallel
-            k_h, k_w = self.get_nodeattr("ConvKernelDim")
-            return self.get_instream_width() * k_h * k_w
-        else:
-            # if parallel variant not in use: same width for output and input stream
-            return self.get_instream_width()
-
-    def get_number_output_values(self):
-        folded_oshape = self.get_folded_output_shape()
-        num_output_elems = np.prod(folded_oshape[:-1])
-        return num_output_elems
-
     def get_swu_variant(self):
-        # checks which variant of the 1D ConvolutionInputGenerator (SWU) can be used
-        # We have 5 variants: ConvolutionInputGenerator_1D_parallel,
+        # checks which variant of the ConvolutionInputGenerator (SWU) can be used
+        # For the 2D case, we have 4 variants:
+        # ConvolutionInputGenerator, ConvolutionInputGenerator_dws,
+        # ConvolutionInputGenerator_kernel_stride, ConvolutionInputGenerator_kernel_stride_dws
+        # For the 1D case, we have 5 variants: ConvolutionInputGenerator_1D_parallel,
         # ConvolutionInputGenerator_1D_dws_naive, ConvolutionInputGenerator_1D,
         # ConvolutionInputGenerator_1D_dws, ConvolutionInputGenerator_1D_dws_stride
         is_dws = self.get_nodeattr("depthwise")
-        is_strided = np.prod(self.get_nodeattr("Stride")) > 1
-        is_stride_2 = np.prod(self.get_nodeattr("Stride")) == 2
-        is_dilated = np.prod(self.get_nodeattr("Dilation")) > 1
-        if self.use_parallel_window_output():
- return "ConvolutionInputGenerator_1D_parallel" - if not is_dws: - return "ConvolutionInputGenerator_1D" - if is_dws: - if (is_strided and not is_stride_2) or (is_dilated): - return "ConvolutionInputGenerator_1D_dws_naive" - elif is_stride_2: - return "ConvolutionInputGenerator_1D_dws_stride" - else: - return "ConvolutionInputGenerator_1D_dws" - - def get_1d_conv_attrs_normalized(self): - # support both (1, D) and (D, 1) cases transparently: - # For the kernel, presenting the input data of size D as - # [H, W] = [Y, X] = [1, D] or [D, 1] - # effectively gives the same result. - # For consistency and ease of programming, this function - # returns the attributes of the layer as follows: - # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. - # The dummy ('1') dimension is the Y-dimension. - ifm_ch = self.get_nodeattr("IFMChannels") - k = self.get_nodeattr("ConvKernelDim") - ifm_dim = self.get_nodeattr("IFMDim") - ofm_dim = self.get_nodeattr("OFMDim") - stride = self.get_nodeattr("Stride") - dilation = self.get_nodeattr("Dilation") - - # see defines() for an explanation - if ifm_dim[1] == 1: - ifm_dim = ifm_dim[::-1] - ofm_dim = ofm_dim[::-1] - k = k[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + if self.get_nodeattr("is1D"): + is_strided = np.prod(self.get_nodeattr("Stride")) > 1 + is_stride_2 = np.prod(self.get_nodeattr("Stride")) == 2 + is_dilated = np.prod(self.get_nodeattr("Dilation")) > 1 + if self.use_parallel_window_output(): + return "ConvolutionInputGenerator_1D_parallel" + if not is_dws: + return "ConvolutionInputGenerator_1D" + if is_dws: + if (is_strided and not is_stride_2) or (is_dilated): + return "ConvolutionInputGenerator_1D_dws_naive" + elif is_stride_2: + return "ConvolutionInputGenerator_1D_dws_stride" + else: + return "ConvolutionInputGenerator_1D_dws" + else: + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + hls_call = "ConvolutionInputGenerator" + if k % stride != 0: + hls_call += "_kernel_stride" + if is_dws: + hls_call += "_dws" + return hls_call def use_parallel_window_output(self): - # Check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to + if not self.get_nodeattr("is1D"): + return False + # If 1D, check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to # feed window in parallel to the following layer, enabling full SIMD unfolding. 
stride = self.get_nodeattr("Stride") dilation = self.get_nodeattr("Dilation") @@ -261,61 +134,88 @@ def use_parallel_window_output(self): def get_exp_cycles(self): simd = self.get_nodeattr("SIMD") - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() - - # since mmv != 1 is not supported yet, we set mmv for now to 1 - # mmv = 1 - # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h - swu_variant = self.get_swu_variant() - if swu_variant == "ConvolutionInputGenerator_1D_parallel": - exp_cycles = k_w + ofm_dim_w - elif swu_variant == "ConvolutionInputGenerator_1D": - exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd - elif swu_variant in [ - "ConvolutionInputGenerator_1D_dws", - "ConvolutionInputGenerator_1D_dws_stride", - ]: - exp_cycles = ( - 1 + ofm_dim_w * k_w * ifm_ch / simd + (ifm_ch / simd) * (k_w - 1) - (k_w - 1) - ) - elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": - cycles_read_block = ifm_dim_w * ifm_ch / simd - cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd - exp_cycles = cycles_read_block + cycles_write_block + # 2D case + if not self.get_nodeattr("is1D"): + ifm_ch = self.get_nodeattr("IFMChannels") + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ofm_dim_h, ofm_dim_w = self.get_nodeattr("OFMDim") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h + cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv + cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd) + max_cycles = max(cycles_write_block, cycles_read_block) + exp_cycles = ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles + # 1D case + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + + swu_variant = self.get_swu_variant() + if swu_variant == "ConvolutionInputGenerator_1D_parallel": + exp_cycles = k_w + ofm_dim_w + elif swu_variant == "ConvolutionInputGenerator_1D": + exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd + elif swu_variant in [ + "ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: + exp_cycles = ( + 1 + ofm_dim_w * k_w * ifm_ch / simd + (ifm_ch / simd) * (k_w - 1) - (k_w - 1) + ) + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + cycles_read_block = ifm_dim_w * ifm_ch / simd + cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd + exp_cycles = cycles_read_block + cycles_write_block return int(exp_cycles) def bram_estimation(self): simd = self.get_nodeattr("SIMD") - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() + is1D = self.get_nodeattr("is1D") + if not is1D: + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() ram_style = self.get_nodeattr("ram_style") 
swu_variant = self.get_swu_variant()
         if swu_variant == "ConvolutionInputGenerator_1D_parallel":
             return 0
         if ram_style == "block" or ram_style == "auto":
-            if swu_variant == "ConvolutionInputGenerator_1D":
-                ram_depth = (k_w - 1) * ifm_ch / simd
-            elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
-                ram_depth = ifm_dim_w * ifm_ch / simd
-            elif swu_variant in [
-                "ConvolutionInputGenerator_1D_dws",
-                "ConvolutionInputGenerator_1D_dws_stride",
-            ]:
-                ram_depth = k_w * ifm_ch / simd
+            if not is1D:
+                ram_depth = ifm_dim * ifm_ch / simd
+            else:
+                if swu_variant == "ConvolutionInputGenerator_1D":
+                    ram_depth = (k_w - 1) * ifm_ch / simd
+                elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
+                    ram_depth = ifm_dim_w * ifm_ch / simd
+                elif swu_variant in [
+                    "ConvolutionInputGenerator_1D_dws",
+                    "ConvolutionInputGenerator_1D_dws_stride",
+                ]:
+                    ram_depth = k_w * ifm_ch / simd
+            # after calculating ram_depth depending on the variant,
+            # determine the ram_width
             if ram_depth <= 512:
                 ram_width = 36
             elif ram_depth <= 1024:
@@ -328,27 +228,48 @@ def bram_estimation(self):
                 ram_width = 2
             else:
                 ram_width = 1
+
             width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width)
-            depth_mul = math.ceil(ram_depth / 18432)
-            return width_mul * depth_mul
+            if not is1D:
+                depth_mul = math.ceil(ifm_dim * ifm_ch / simd / ram_depth)
+                return int((k + stride) * width_mul * depth_mul)
+            else:
+                depth_mul = math.ceil(ram_depth / 18432)
+                return int(width_mul * depth_mul)
         else:
             return 0

     def lut_estimation(self):
         simd = self.get_nodeattr("SIMD")
-        (
-            ifm_ch,
-            [ifm_dim_h, ifm_dim_w],
-            [ofm_dim_h, ofm_dim_w],
-            [k_h, k_w],
-            [stride_h, stride_w],
-            [dilation_h, dilation_w],
-        ) = self.get_1d_conv_attrs_normalized()
+        is1D = self.get_nodeattr("is1D")
+        if not is1D:
+            ifm_ch = self.get_nodeattr("IFMChannels")
+            ifm_dim = self.get_nodeattr("IFMDim")[0]
+            k = self.get_nodeattr("ConvKernelDim")[0]
+            stride = self.get_nodeattr("Stride")[0]
+        else:
+            (
+                ifm_ch,
+                [ifm_dim_h, ifm_dim_w],
+                [ofm_dim_h, ofm_dim_w],
+                [k_h, k_w],
+                [stride_h, stride_w],
+                [dilation_h, dilation_w],
+            ) = self.get_1d_conv_attrs_normalized()
         ram_style = self.get_nodeattr("ram_style")
         swu_variant = self.get_swu_variant()
         if swu_variant == "ConvolutionInputGenerator_1D_parallel":
             ram_luts = math.ceil(simd * self.get_input_datatype().bitwidth() * (k_w + 1) / 64)
-        elif ram_style == "distributed":
+        if ram_style == "distributed":
+            if not is1D:
+                ram_luts = int(
+                    (k + stride)
+                    * (
+                        simd
+                        * self.get_input_datatype().bitwidth()
+                        * math.ceil(ifm_dim * ifm_ch / simd / 64)
+                    )
+                )
             if swu_variant == "ConvolutionInputGenerator_1D":
                 ram_luts = math.ceil(self.get_input_datatype().bitwidth() * (k_w - 1) * ifm_ch / 64)
             elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
@@ -364,34 +285,51 @@ def lut_estimation(self):

     def uram_estimation(self):
         simd = self.get_nodeattr("SIMD")
-        (
-            ifm_ch,
-            [ifm_dim_h, ifm_dim_w],
-            [ofm_dim_h, ofm_dim_w],
-            [k_h, k_w],
-            [stride_h, stride_w],
-            [dilation_h, dilation_w],
-        ) = self.get_1d_conv_attrs_normalized()
+        is1D = self.get_nodeattr("is1D")
+        if not is1D:
+            ifm_ch = self.get_nodeattr("IFMChannels")
+            ifm_dim = self.get_nodeattr("IFMDim")[0]
+            k = self.get_nodeattr("ConvKernelDim")[0]
+            stride = self.get_nodeattr("Stride")[0]
+        else:
+            (
+                ifm_ch,
+                [ifm_dim_h, ifm_dim_w],
+                [ofm_dim_h, ofm_dim_w],
+                [k_h, k_w],
+                [stride_h, stride_w],
+                [dilation_h, dilation_w],
+            ) = self.get_1d_conv_attrs_normalized()
+
         ram_style = self.get_nodeattr("ram_style")
         swu_variant =
self.get_swu_variant() if swu_variant == "ConvolutionInputGenerator_1D_parallel": return 0 - elif ram_style == "ultra": - if swu_variant == "ConvolutionInputGenerator_1D": - width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) - depth_mul = math.ceil((k_w - 1) * ifm_ch / simd / 4096) - return width_mul * depth_mul - elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": - width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) - depth_mul = math.ceil(ifm_dim_w * ifm_ch / simd / 4096) - return width_mul * depth_mul - elif swu_variant in [ - "ConvolutionInputGenerator_1D_dws", - "ConvolutionInputGenerator_1D_dws_stride", - ]: - width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) - depth_mul = math.ceil(k_w * ifm_ch / simd / 4096) - return width_mul * depth_mul + if ram_style == "ultra": + if not is1D: + return int( + (k + stride) + * ( + math.ceil(simd * self.get_input_datatype().bitwidth() / 64) + * math.ceil(ifm_dim * ifm_ch / simd / 4096) + ) + ) + else: + if swu_variant == "ConvolutionInputGenerator_1D": + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil((k_w - 1) * ifm_ch / simd / 4096) + return width_mul * depth_mul + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil(ifm_dim_w * ifm_ch / simd / 4096) + return width_mul * depth_mul + elif swu_variant in [ + "ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil(k_w * ifm_ch / simd / 4096) + return width_mul * depth_mul else: return 0 @@ -485,18 +423,28 @@ def global_includes(self): def defines(self, var): numReps = 1 - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() + is1D = self.get_nodeattr("is1D") simd = self.get_nodeattr("SIMD") ifm_precision = self.get_input_datatype().bitwidth() + if not is1D: + ifm_dim = self.get_nodeattr("IFMDim")[0] + ifm_ch = self.get_nodeattr("IFMChannels") + ofm_dim = self.get_nodeattr("OFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + swu_variant = self.get_swu_variant() + # check all different 1D scenarios if swu_variant in [ "ConvolutionInputGenerator_1D_parallel", "ConvolutionInputGenerator_1D", @@ -523,7 +471,7 @@ def defines(self, var): numReps, ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws": + elif swu_variant == "ConvolutionInputGenerator_1D_dws": self.code_gen_dict["$DEFINES$"] = [ """ #define ConvKernelDim1_x {}\n @@ -543,7 +491,7 @@ def defines(self, var): numReps, ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": self.code_gen_dict["$DEFINES$"] = [ """ #define ConvKernelDim1_x {}\n @@ -567,44 +515,16 @@ def defines(self, var): numReps, ) ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = 
self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) + # default to 2D cases + else: + self.code_gen_dict["$DEFINES$"] = [ + """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n + #define Input_precision1 {}\n #define IFMDim1 {}\n + #define OFMDim1 {}\n #define SIMD1 {}\n + #define Stride1 {}\n #define numReps {}""".format( + k, ifm_ch, ifm_precision, ifm_dim, ofm_dim, simd, stride, numReps + ) + ] def docompute(self): ram_style = self.get_nodeattr("ram_style") @@ -617,7 +537,7 @@ def docompute(self): hls_ram_style = map_to_hls_ram_style[ram_style] swu_variant = self.get_swu_variant() - # check which ConvolutionInputGenerator is needed + # check which 1D ConvolutionInputGenerator is needed if swu_variant == "ConvolutionInputGenerator_1D_parallel": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} @@ -634,7 +554,7 @@ def docompute(self): swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws": + elif swu_variant == "ConvolutionInputGenerator_1D_dws": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} @@ -642,7 +562,7 @@ def docompute(self): swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws_stride": + elif swu_variant == "ConvolutionInputGenerator_1D_dws_stride": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} @@ -650,7 +570,7 @@ def docompute(self): swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} @@ -658,6 +578,13 @@ def docompute(self): swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style + ) + ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -695,9 +622,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): if self.use_parallel_window_output(): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ @@ -714,12 +638,3 @@ def blackboxfunction(self): self.onnx_node.name, self.hls_sname(), self.hls_sname() ) ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py 
b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py new file mode 100644 index 0000000000..56f472b9c0 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py @@ -0,0 +1,165 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.downsampler import DownSampler +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class DownSampler_hls(DownSampler, HLSBackend): + """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function. 
+    Basically performs downsampling of the image, removing rows and columns."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(DownSampler.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+        ifm_ch = self.get_nodeattr("NumChannels")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)]
+
+        ibits = self.get_input_datatype().bitwidth()
+        self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)]
+
+        idim = self.get_nodeattr("ImgDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)]
+
+        simd = self.get_nodeattr("SIMD")
+        self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(simd)]
+
+        stride = self.get_nodeattr("Stride")
+        self.code_gen_dict["$DEFINES$"] += ["#define Stride {}".format(stride)]
+
+        batch_size = self.get_nodeattr("numInputVectors")
+        self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)]
+
+    def docompute(self):
+        dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D"
+        sname = self.hls_sname()
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            f"""ConvolutionInputGenerator_{dim_var}_kernel1<IFMChannels, Input_precision,
+            IFMDim, SIMD, Stride> (in0_{sname}, out_{sname}, numReps);"""
+        ]
+
+    def blackboxfunction(self):
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+            % (
+                self.onnx_node.name,
+                packed_hls_type,
+                self.hls_sname(),
+                packed_hls_type,
+                self.hls_sname(),
+            )
+        ]
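As a numpy-level sketch (illustration only, not part of this patch) of what the kernel-1 sliding window above computes: downsampling keeps every Stride-th row and column of an NHWC tensor. Shapes and stride here are hypothetical:

```python
# Illustration only: numpy equivalent of DownSampler (kernel-1 window).
import numpy as np

stride = 2
x = np.random.rand(1, 8, 8, 3).astype(np.float32)  # hypothetical NHWC input
y = x[:, ::stride, ::stride, :]  # keep every stride-th row and column
assert y.shape == (1, 4, 4, 3)
```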
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels)."""
+        export_idt = self.get_input_datatype()
+
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == exp_oshape
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape
+        (1, OutputDim, OutputDim, NumChannels)."""
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py
similarity index 66%
rename from src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
rename to src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py
index 1f2d1b79be..e19149435e 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
 # All rights reserved.
# # Redistribution and use in source and binary forms, with or without @@ -28,91 +28,24 @@ import numpy as np import os -import warnings -from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class DuplicateStreams_Batch(HLSCustomOp): +class DuplicateStreams_hls(DuplicateStreams, HLSBackend): """Class that corresponds to finn-hlslib function of the same name.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - "NumChannels": ("i", True, 0), - "PE": ("i", True, 0), - # how many duplicated output streams to create - "NumOutputStreams": ("i", True, 0), - # FINN DataTypes for input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(DuplicateStreams.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_num_output_streams(self): - return self.get_nodeattr("NumOutputStreams") - - def get_normal_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ch]) - return ishape - - def get_folded_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) - assert ch % pe == 0, "PE must divide NumChannels" - folds = int(ch / pe) - folded_ishape = tuple(vecs + [folds, pe]) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - # since the output shape of both out streams are the same - # return independently from index - return self.get_normal_input_shape() - - def get_folded_output_shape(self, ind=0): - # since the output shape of both out streams are the same - # return independently from index - return self.get_folded_input_shape() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape." 
- num_out = self.get_num_output_streams() - assert len(self.onnx_node.output) == num_out, "Unexpected number of outputs" - - oshape = self.get_normal_output_shape() - ret = super().make_const_shape_op(oshape) - ret.output[:] = self.onnx_node.output - return ret - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - odt = self.get_output_datatype() - for my_out in self.onnx_node.output: - model.set_tensor_datatype(my_out, odt) - def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -136,35 +69,6 @@ def verify_node(self): return info_messages - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - obits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = pe * obits - return out_width - - def get_number_output_values(self): - return self.get_num_output_streams() * np.prod(self.get_folded_output_shape()[1:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - def generate_params(self, model, path): n_outputs = self.get_num_output_streams() inp_streams = [] @@ -292,28 +196,6 @@ def global_includes(self): def defines(self, var): self.code_gen_dict["$DEFINES$"] = [] - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - def strm_decl(self): n_outputs = self.get_num_output_streams() self.code_gen_dict["$STREAMDECLARATIONS$"] = [] @@ -371,9 +253,6 @@ def dataoutstrm(self): self.code_gen_dict["$DATAOUTSTREAM$"] = outstrm_code - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): n_outputs = self.get_num_output_streams() inp_streams = [] @@ -406,24 +285,3 @@ def pragmas(self): "#pragma HLS INTERFACE axis port=out%d_%s" % (i, self.hls_sname()) ) self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - n_outputs = self.get_num_output_streams() - sname = self.hls_sname() - intf_names["m_axis"] = [] - for i in range(n_outputs): - intf_names["m_axis"].append( - ("out%d_%s" % (i, sname), self.get_outstream_width_padded()) - ) - return intf_names - - def 
derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out0": [], "out1": []}, - } - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py similarity index 51% rename from src/finn/custom_op/fpgadataflow/fmpadding_batch.py rename to src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py index 5bd5e07916..d57699af05 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,14 +28,13 @@ import numpy as np import os -import warnings -from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.fmpadding import FMPadding +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class FMPadding_Batch(HLSCustomOp): +class FMPadding_hls(FMPadding, HLSBackend): """Corresponds to finn-hlslib FMPadding_Batch function. Pads input image by given amount.""" @@ -43,125 +42,11 @@ def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - # spatial size of input images - "ImgDim": ("ints", True, []), # [H, W] = [Y, X] - # total padding (per dimension) to apply - "Padding": ( - "ints", - True, - [1, 1, 1, 1], - ), # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end] - # number of channels in input image - "NumChannels": ("i", True, 0), - # SIMD Input parallelism - "SIMD": ("i", False, 1), - # FINN input datatype - "inputDataType": ("s", True, ""), - # shape describing input vecs per execution - "numInputVectors": ("i", False, 1), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(FMPadding.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_padded_odim(self): - "Return the padded spatial size of the output." 
- idim_h, idim_w = self.get_nodeattr("ImgDim") - pad = self.get_nodeattr("Padding") - pad_h = pad[0] + pad[2] - pad_w = pad[1] + pad[3] - odim_h = idim_h + pad_h - odim_w = idim_w + pad_w - return [odim_h, odim_w] - - def get_exp_cycles(self): - odim_h, odim_w = self.get_padded_odim() - channels = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = (channels / simd) * batch_size * odim_h * odim_w - return int(exp_cycles) - - def get_normal_input_shape(self, ind=0): - idim_h, idim_w = self.get_nodeattr("ImgDim") - num_ch = self.get_nodeattr("NumChannels") - ishape = (1, idim_h, idim_w, num_ch) - return ishape - - def get_normal_output_shape(self, ind=0): - odim_h, odim_w = self.get_padded_odim() - num_ch = self.get_nodeattr("NumChannels") - - oshape = (1, odim_h, odim_w, num_ch) - return oshape - - def get_folded_input_shape(self, ind=0): - normal_ishape = list(self.get_normal_input_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_ishape[-1] / simd) - folded_ishape = normal_ishape[:-1] + [fold, simd] - return tuple(folded_ishape) - - def get_folded_output_shape(self, ind=0): - normal_oshape = list(self.get_normal_output_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_oshape[-1] / simd) - folded_oshape = normal_oshape[:-1] + [fold, simd] - return tuple(folded_oshape) - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape for SameResize." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - ret = DataType[self.get_nodeattr("inputDataType")] - # the hlslib op always pads with zeros, so ensure that the DataType - # is able to represent zeros - assert ret.allowed(0), "FMPadding_Batch DataType must support zero" - return ret - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output. 
(Same as input datatype)""" - return self.get_input_datatype() - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return ibits * simd - - def get_outstream_width(self, ind=0): - obits = self.get_output_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return obits * simd - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] @@ -214,48 +99,8 @@ def defines(self, var): ) ] - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): in_t = self.get_input_datatype().get_hls_datatype_str() - node = self.onnx_node - idim_h, idim_w = self.get_nodeattr("ImgDim") pad = self.get_nodeattr("Padding") pad_h = pad[0] + pad[2] @@ -264,7 +109,7 @@ def docompute(self): is_square_pad = pad_h == pad_w if is_square_img and is_square_pad: - hls_call = node.op_type + hls_call = "FMPadding_Batch" self.code_gen_dict["$DOCOMPUTE$"] = [ """{} (in0_{}, out_{}, numReps);""".format( @@ -281,37 +126,6 @@ def docompute(self): ) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits @@ -326,15 +140,6 @@ def blackboxfunction(self): ) ] - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE 
ap_ctrl_none port=return") - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py new file mode 100644 index 0000000000..b7ba301fbc --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py @@ -0,0 +1,167 @@ +# Copyright (c) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
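The new FMPadding_Pixel_hls file that follows implements pixel padding, i.e. inserting zeros between neighboring input pixels, as typically used in front of transposed convolutions. A numpy-level sketch (illustration only, not part of this patch; names and shapes are hypothetical):

```python
# Illustration only: zero-insertion between pixels, matching
# get_padded_odim(): odim = (idim - 1) * stride + 1 per spatial axis.
import numpy as np

def fm_padding_pixel(x, stride_h, stride_w):
    n, h, w, c = x.shape
    out = np.zeros((n, (h - 1) * stride_h + 1, (w - 1) * stride_w + 1, c), x.dtype)
    out[:, ::stride_h, ::stride_w, :] = x  # original pixels keep grid positions
    return out

y = fm_padding_pixel(np.ones((1, 3, 3, 2), np.float32), 2, 2)
assert y.shape == (1, 5, 5, 2)
```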
+
+
+import numpy as np
+import os
+
+from finn.custom_op.fpgadataflow.fmpadding_pixel import FMPadding_Pixel
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class FMPadding_Pixel_hls(FMPadding_Pixel, HLSBackend):
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(FMPadding_Pixel.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
+
+    def defines(self, var):
+        odim_h, odim_w = self.get_padded_odim()
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        self.code_gen_dict["$DEFINES$"] = [
+            """
+            #define OutputDim_x {}\n
+            #define OutputDim_y {}\n
+            #define Stride_x {}\n
+            #define Stride_y {}\n
+            #define NumChannels {}\n
+            #define SIMD {}\n
+            """.format(
+                odim_w,
+                odim_h,
+                stride_w,
+                stride_h,
+                self.get_nodeattr("NumChannels"),
+                self.get_nodeattr("SIMD"),
+            )
+        ]
+
+    def docompute(self):
+        in_t = self.get_input_datatype().get_hls_datatype_str()
+        odim_h, odim_w = self.get_padded_odim()
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        hls_call = "FMPadding_Pixel_Nonsquare"
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """{}<OutputDim_x, OutputDim_y, Stride_x, Stride_y,
+            NumChannels, SIMD, {}> (in0_{}, out_{});""".format(
+                hls_call, in_t, self.hls_sname(), self.hls_sname()
+            )
+        ]
+
+    def blackboxfunction(self):
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+            % (
+                self.onnx_node.name,
+                packed_hls_type,
+                self.hls_sname(),
+                packed_hls_type,
+                self.hls_sname(),
+            )
+        ]
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        exp_ishape = self.get_normal_input_shape()
+        exp_oshape = self.get_normal_output_shape()
+        folded_ishape = self.get_folded_input_shape()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert (
+            inp.shape == exp_ishape
+        ), """Input shape doesn't
+        match expected shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
+        export_idt = self.get_input_datatype()
+
+        reshaped_input = inp.reshape(folded_ishape)
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == exp_oshape
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = export_idt
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape
+        (1, OutputDim_H, OutputDim_W, NumChannels)."""
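The next file adds GlobalAccPool_hls, wrapping the finn-hlslib AccPool_Batch function. As a numpy-level sketch (illustration only, not part of this patch; shapes are hypothetical), global accumulation pooling sums each channel over all spatial positions:

```python
# Illustration only: numpy equivalent of global accumulation pooling.
import numpy as np

x = np.random.randint(0, 4, size=(1, 7, 7, 16))  # hypothetical NHWC input
y = x.sum(axis=(1, 2), keepdims=True)            # per-channel sums, (1, 1, 1, 16)
```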
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class GlobalAccPool_hls(GlobalAccPool, HLSBackend): + """Class that corresponds to finn-hlslib AccPool_Batch function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(GlobalAccPool.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required GlobalAccPool_Batch attributes do not exist.""") + + # verify that input data is 2D + if len(self.get_nodeattr("numInputVectors")) != 3: + info_messages.append("""GlobalAccPool_Batch requires 2D data input.""") + raise Exception + + return info_messages + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        inp = context[node.input[0]]
+        assert str(inp.dtype) == "float32", "Input datatype is not float32"
+        assert inp.shape == exp_ishape, """Input shape doesn't match expected shape."""
+        export_idt = self.get_input_datatype()
+        # reshape input into folded form
+        inp = inp.reshape(folded_ishape)
+        # make copy before saving array
+        reshaped_input = inp.copy()
+        np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == exp_oshape
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            rtlsim_inp = npy_to_rtlsim_input(
+                "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+            )
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """AccPool_Batch<{}, {}, {}, {}, {}> (in0_{}, out_{}, 1);""".format(
+                self.get_normal_input_shape()[1],
+                self.get_nodeattr("NumChannels"),
+                self.get_input_datatype().get_hls_datatype_str(),
+                self.get_nodeattr("PE"),
+                self.get_output_datatype().get_hls_datatype_str(),
+                self.hls_sname(),
+                self.hls_sname(),
+            )
+        ]
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            """void {}(hls::stream<ap_uint<{}>> &in0_{},
+            hls::stream<ap_uint<{}>> &out_{})""".format(
+                self.onnx_node.name,
+                self.get_instream_width(),
+                self.hls_sname(),
+                self.get_outstream_width(),
+                self.hls_sname(),
+            )
+        ]
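Each `$PLACEHOLDER$` list filled in by the methods above is later substituted into an HLS C++ template by the shared HLSBackend code-generation step. As a minimal sketch of that mechanism, with purely hypothetical attribute values rather than code from this patch (the real templates live in FINN's templates module):

```
# minimal sketch of FINN's placeholder substitution (hypothetical values)
code_gen_dict = {
    "$GLOBALS$": ['#include "maxpool.h"'],
    "$DOCOMPUTE$": ['AccPool_Batch<16, 64, ap_int<8>, 4, ap_int<32>> (in0_V, out_V, 1);'],
}
template = "$GLOBALS$\nvoid top() {\n    $DOCOMPUTE$\n}"
for key, lines in code_gen_dict.items():
    # each placeholder is replaced by the generated C++ lines for that section
    template = template.replace(key, "\n".join(lines))
print(template)  # yields a compilable HLS C++ snippet
```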
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
similarity index 98%
rename from src/finn/custom_op/fpgadataflow/iodma.py
rename to src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
index bb3de268a0..8d9903f0f5 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2020-2022, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -31,7 +32,8 @@
 import warnings
 from qonnx.core.datatype import DataType
 
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
 
 # the IODMA interfaces a memory-mapped AXI interface and an AXI stream
 # direction "in": pulls data from AXI-MM to AXI stream
@@ -72,7 +74,7 @@
 # -the folded shape is not defined
 
-class IODMA(HLSCustomOp):
+class IODMA_hls(HWCustomOp, HLSBackend):
     """Class that corresponds to finn-hlslib DMA function(s)."""
 
     def __init__(self, onnx_node, **kwargs):
@@ -97,7 +99,8 @@ def get_nodeattr_types(self):
             # name of axi-mm interface
             "intfName": ("s", False, ""),
         }
-        my_attrs.update(super().get_nodeattr_types())
+        my_attrs.update(HWCustomOp.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
         return my_attrs
 
     def get_normal_input_shape(self, ind=0):
@@ -407,18 +410,6 @@ def pragmas(self):
 
     def execute_node(self, context, graph):
         pass
 
-    def dataoutstrm(self):
-        pass
-
-    def read_npy_data(self):
-        pass
-
-    def save_as_npy(self):
-        pass
-
-    def strm_decl(self):
-        pass
-
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
         if self.get_nodeattr("direction") == "out":
diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py
similarity index 57%
rename from src/finn/custom_op/fpgadataflow/labelselect_batch.py
rename to src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py
index 60d3eb9154..1e2c0d034a 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
 # All rights reserved.
# # Redistribution and use in source and binary forms, with or without @@ -28,99 +28,24 @@ import numpy as np import os -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.labelselect import LabelSelect from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class LabelSelect_Batch(HLSCustomOp): +class LabelSelect_hls(LabelSelect, HLSBackend): """Class that corresponds to finn-hlslib LabelSelect_Batch function.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - odt_name = self.get_nodeattr("outputDataType") - if odt_name == "": - # If not provided compute min size - labels = self.get_nodeattr("Labels") - odt = DataType.get_smallest_possible(labels - 1) - # ensure a datatype divisible by 8-bits in case this is the last node - bw = roundup_to_integer_multiple(odt.bitwidth(), 8) - new_odt_name = odt.name.replace(str(odt.bitwidth()), str(bw)) - odt = DataType[new_odt_name] - odt_name = odt.name - self.set_nodeattr("outputDataType", odt_name) def get_nodeattr_types(self): - my_attrs = { - "Labels": ("i", True, 0), - "PE": ("i", True, 0), - "K": ("i", True, 0), - # FINN DataTypes for input - "inputDataType": ("s", True, ""), - "outputDataType": ("s", False, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(LabelSelect.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_normal_input_shape(self, ind=0): - nlabels = self.get_nodeattr("Labels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [nlabels]) - return ishape - - def get_folded_input_shape(self, ind=0): - nlabels = self.get_nodeattr("Labels") - pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) - assert nlabels % pe == 0, "PE must divide Labels" - folds = int(nlabels / pe) - folded_ishape = tuple(vecs + [folds, pe]) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - k = self.get_nodeattr("K") - vecs = list(self.get_nodeattr("numInputVectors")) - oshape = tuple(vecs + [k]) - return oshape - - def get_folded_output_shape(self, ind=0): - k = self.get_nodeattr("K") - vecs = list(self.get_nodeattr("numInputVectors")) - oshape = tuple(vecs + [k, 1]) - return oshape - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape." 
-        return helper.make_node(
-            "RandomNormal",
-            inputs=[],
-            outputs=[self.onnx_node.output[0]],
-            mean=0.0,
-            scale=1.0,
-            dtype=TensorProto.INT64,
-            shape=list(oshape),
-        )
-
-    def infer_node_datatype(self, model):
-        node = self.onnx_node
-        # check input datatype against property
-        idt = model.get_tensor_datatype(node.input[0])
-        self.set_nodeattr("inputDataType", idt.name)
-
-        odt = self.get_output_datatype()
-        model.set_tensor_datatype(self.onnx_node.output[0], odt)
-
     def verify_node(self):
         info_messages = []
         # verify that "backend" is set to "fpgadataflow"
@@ -150,30 +75,6 @@ def verify_node(self):
 
         return info_messages
 
-    def get_input_datatype(self, ind=0):
-        """Returns FINN DataType of input."""
-        ret = DataType[self.get_nodeattr("inputDataType")]
-        return ret
-
-    def get_output_datatype(self, ind=0):
-        """Returns FINN DataType of output."""
-        ret = DataType[self.get_nodeattr("outputDataType")]
-        return ret
-
-    def get_instream_width(self, ind=0):
-        """Returns input stream width."""
-        ibits = self.get_input_datatype().bitwidth()
-        pe = self.get_nodeattr("PE")
-        in_width = pe * ibits
-        return in_width
-
-    def get_outstream_width(self, ind=0):
-        """Returns output stream width."""
-        return self.get_output_datatype().bitwidth()
-
-    def get_number_output_values(self):
-        return self.get_nodeattr("K")
-
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
@@ -284,24 +185,9 @@ def read_npy_data(self):
             )
         )
 
-    def strm_decl(self):
-        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
-                self.get_instream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
-                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-
     def docompute(self):
-        node = self.onnx_node
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<{}, {}, {}, {}, {} > (in0_{}, out_{}, 1);""".format(
-                node.op_type,
+            """LabelSelect_Batch<{}, {}, {}, {}, {} > (in0_{}, out_{}, 1);""".format(
                 self.get_nodeattr("Labels"),
                 self.get_nodeattr("PE"),
                 self.get_nodeattr("K"),
@@ -312,34 +198,6 @@ def docompute(self):
             )
         ]
 
-    def dataoutstrm(self):
-        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
-        dtype = self.get_output_datatype()
-        elem_bits = dtype.bitwidth()
-        packed_bits = self.get_outstream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
-        elem_hls_type = dtype.get_hls_datatype_str()
-        npy_type = "float"
-        npy_out = "%s/output.npy" % code_gen_dir
-        oshape = self.get_folded_output_shape()
-        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                self.hls_sname(),
-                oshape_cpp_str,
-                npy_out,
-            )
-        ]
-
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
     def blackboxfunction(self):
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
             """void {}(hls::stream<ap_uint<{}>> &in0_{},
@@ -352,18 +210,3 @@ def blackboxfunction(self):
                 self.hls_sname(),
             )
         ]
-
-    def pragmas(self):
-        self.code_gen_dict["$PRAGMAS$"] = [
-            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
-        ]
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
-        )
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
-
-    def get_exp_cycles(self):
-        nlabels = 
self.get_nodeattr("Labels") - pe = self.get_nodeattr("PE") - exp_cycles = nlabels / pe - return int(exp_cycles) diff --git a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py new file mode 100644 index 0000000000..ba44deb898 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py @@ -0,0 +1,337 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from math import ceil, log2 +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.lookup import Lookup +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + + +class Lookup_hls(Lookup, HLSBackend): + "Streaming elementwise HLS lookup, mapping indices to values." 
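+    # Two mem_mode settings are handled below: "internal_embedded" bakes the
+    # embedding table into the generated HLS sources (embeddings.hpp), while
+    # "external" fetches embeddings over an AXI-MM interface and zero-pads
+    # the table for burst alignment (see generate_params).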
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(Lookup.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def global_includes(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        global_incls = []
+        global_incls.append('#include "lookup.hpp"')
+        if mem_mode == "internal_embedded":
+            global_incls.append('#include "embeddings.hpp"')
+        self.code_gen_dict["$GLOBALS$"] = global_incls
+
+    def defines(self, var):
+        n_inputs = np.prod(self.get_folded_input_shape()[:-1])
+        dtype = self.get_input_datatype()
+        elem_hls_type = dtype.get_hls_datatype_str()
+        emb_type = DataType[self.get_nodeattr("EmbeddingType")]
+        emb_hls_type = emb_type.get_hls_datatype_str()
+        emb_dim = self.get_nodeattr("EmbeddingDim")
+        mem_mode = self.get_nodeattr("mem_mode")
+        my_defines = []
+        my_defines.append("#define NumInputs %d" % n_inputs)
+        if mem_mode == "external":
+            ext_mem_width = self.get_nodeattr("ext_mem_width")
+            ext_mem_emb_size = self.get_folded_output_shape()[-2]
+            ext_mem_emb_align = ceil(log2(ext_mem_emb_size))
+            my_defines.append("#define MemBits %d" % ext_mem_width)
+            my_defines.append("#define EmbeddingSize %d" % ext_mem_emb_size)
+            my_defines.append("#define EmbeddingAlign %d" % ext_mem_emb_align)
+            my_defines.append("#define T_SRC %s" % elem_hls_type)
+            my_defines.append("#define T_DST ap_uint<MemBits>")
+        elif mem_mode == "internal_embedded":
+            my_defines.append("#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings"))
+            my_defines.append("#define EmbeddingDim %d" % emb_dim)
+            my_defines.append("#define InputType %s" % elem_hls_type)
+            my_defines.append("#define EmbeddingType %s" % emb_hls_type)
+        self.code_gen_dict["$DEFINES$"] = my_defines
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "int64_t"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
+        )
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", %s);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                self.hls_sname(),
+                oshape_cpp_str,
+                npy_out,
+                "false",
+            )
+        ]
+
+    def docompute(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "internal_embedded":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """StreamingLookup<NumEmbeddings, EmbeddingDim, NumInputs,
+                InputType, EmbeddingType>(in0_%s, out_%s, embeddings);"""
+                % (self.hls_sname(), self.hls_sname())
+            ]
+        elif mem_mode == "external":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """StreamingLookup_ext<EmbeddingSize>(in0_%s, out_%s, mem, size, oob_count,
+                oob_irq);"""
+                % (self.hls_sname(), self.hls_sname())
+            ]
+
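+    # note: the external variant's extra arguments (mem, size, oob_count,
+    # oob_irq) map onto the AXI-MM, AXI-lite and bare-wire ports declared in
+    # blackboxfunction and pragmas below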
+    def blackboxfunction(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        ibits = self.get_instream_width()
+        packed_input_hls_type = "ap_uint<%d>" % ibits
+        obits = self.get_outstream_width()
+        packed_output_hls_type = "ap_uint<%d>" % obits
+        if mem_mode == "internal_embedded":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+                % (
+                    self.onnx_node.name,
+                    packed_input_hls_type,
+                    self.hls_sname(),
+                    packed_output_hls_type,
+                    self.hls_sname(),
+                )
+            ]
+        elif mem_mode == "external":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                "void "
+                + self.onnx_node.name
+                + "(hls::stream<T_SRC> &in0_%s, hls::stream<T_DST> &out_%s, "
+                % (self.hls_sname(), self.hls_sname())
+                + "T_DST const *const mem, unsigned const size, "
+                + "unsigned &oob_count, bool &oob_irq)"
+            ]
+
+    def pragmas(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        my_pragmas = ["#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()]
+        my_pragmas.append("#pragma HLS INTERFACE axis port=out_" + self.hls_sname())
+        my_pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+        if mem_mode == "internal_embedded":
+            my_pragmas.append("#pragma HLS BIND_STORAGE variable=embeddings type=ROM_2P impl=BRAM")
+        elif mem_mode == "external":
+            my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem")
+            my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control")
+            my_pragmas.append("#pragma HLS INTERFACE s_axilite port=size bundle=control")
+            my_pragmas.append("#pragma HLS INTERFACE s_axilite port=oob_count bundle=control")
+            my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq")
+        else:
+            raise Exception("Unrecognized mem_mode: " + mem_mode)
+        self.code_gen_dict["$PRAGMAS$"] = my_pragmas
+
+    def generate_params(self, model, path):
+        mem_mode = self.get_nodeattr("mem_mode")
+        embeddings = model.get_initializer(self.onnx_node.input[1])
+        if mem_mode == "internal_embedded":
+            code_gen_dir = path
+            weight_filename = "{}/embeddings.hpp".format(code_gen_dir)
+            edt = DataType[self.get_nodeattr("EmbeddingType")]
+            # obits = self.get_outstream_width()
+            # packed_output_hls_type = "ap_uint<%d>" % obits
+            assert np.vectorize(edt.allowed)(
+                embeddings
+            ).all(), "Embeddings can't be expressed with type %s" % str(edt)
+            # reverse innermost dim in embeddings to remain compatible with
+            # how we normally encode the data in FINN
+            embeddings_rev = np.flip(embeddings, -1)
+            embeddings_hls_code = numpy_to_hls_code(embeddings_rev, edt, "embeddings", True, False)
+            f_thresh = open(weight_filename, "w")
+            f_thresh.write(embeddings_hls_code)
+            f_thresh.close()
+        elif mem_mode == "external":
+            edt = DataType[self.get_nodeattr("EmbeddingType")]
+            ext_mem_width = self.get_nodeattr("ext_mem_width")
+            assert edt.bitwidth() == 8, (
+                "Lookup with mem_mode=external "
+                + "only works with 8-bit embeddings but found "
+                + str(edt)
+            )
+            emb_dim = self.get_nodeattr("EmbeddingDim")
+            # need to zero-pad embeddings in external mode for burst alignment
+            # compute how much padding we need
+            emb_elems_per_ext_mem_width = self.get_folded_output_shape()[-1]
+            ext_mem_emb_size = self.get_folded_output_shape()[-2]
+            ext_mem_emb_align = ceil(log2(ext_mem_emb_size))
+            align_factor = int((ext_mem_width / 8) * 2**ext_mem_emb_align)
+            pad_amount = align_factor - emb_dim
+            embeddings_padded 
= np.pad(embeddings, [(0, 0), (0, pad_amount)]) + # reshape for packing the innermost dim + embeddings_padded = embeddings_padded.reshape(-1, emb_elems_per_ext_mem_width) + weight_filename = "%s/%s.dat" % (path, self.onnx_node.name) + ret = pack_innermost_dim_as_hex_string( + embeddings_padded, edt, ext_mem_width, True, prefix="" + ) + with open(weight_filename, "w") as f: + for current_line in ret: + f.write(current_line + "\n") + else: + raise Exception("Unrecognized mem_mode: " + mem_mode) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = tuple(self.get_normal_input_shape()) + exp_oshape = tuple(self.get_normal_output_shape()) + folded_ishape = tuple(self.get_folded_input_shape()) + folded_oshape = tuple(self.get_folded_output_shape()) + mem_mode = self.get_nodeattr("mem_mode") + assert ( + mem_mode == "internal_embedded" + ), "Only mem_mode=internal_embedded is supported for simulation of Lookup layer" + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert inp.dtype == np.int64, "Inputs must be contained in int64 ndarray" + assert inp.shape == exp_ishape, """Input shape doesn't match expected shape.""" + export_idt = self.get_input_datatype() + odt = self.get_output_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, + out_npy_path, + odt, + out_shape, + packed_bits, + target_bits, + reverse_inner=True, + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def get_ap_int_max_w(self): + parent_max = super().get_ap_int_max_w() + mem_mode = self.get_nodeattr("mem_mode") + ext_mem_width = self.get_nodeattr("ext_mem_width") + if mem_mode == "external": + return max(ext_mem_width, parent_max) + else: + return parent_max diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py new file mode 100644 index 0000000000..94f8cc0845 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -0,0 +1,590 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +# ONNX i/o tensor shape assumptions for MatrixVectorActivation_hls: +# input 0 is the input tensor, shape (.., i_size) = (..., MW) +# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) +# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) +# output 0 is the output tensor, shape (.., o_size) = (..., MH) +# the ... 
here can be any shape (representing groups of vectors) + + +class MVAU_hls(MVAU, HLSBackend): + """Corresponds to finn-hlslib MatrixVectorActivation_Batch function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(MVAU.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "internal_decoupled" and mstyle == "distributed") or ( + mmode == "internal_embedded" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. 
In this scenario,
+        # we use the minimum accumulator width as determined by the data type
+        # bound derived in https://arxiv.org/abs/2301.13376
+        alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed())
+        acc_bits = min(
+            acc_datatype.bitwidth(),
+            np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
+        )
+        acc_luts = acc_bits
+        # thresholds and threshold comparators
+        thr_luts = 0
+        comp_luts = 0
+        noact = self.get_nodeattr("noActivation")
+        tmem_style = self.get_nodeattr("ram_style_thresholds")
+        if (noact == 0) and (tmem_style == "distributed"):
+            odt = self.get_output_datatype()
+            B = odt.bitwidth()
+            thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
+            comp_luts = (2**B - 1) * acc_bits
+
+        return int(
+            c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
+        )
+
+    def dsp_estimation(self):
+        # multiplication
+        P = self.get_nodeattr("PE")
+        res_type = self.get_nodeattr("resType")
+        Q = self.get_nodeattr("SIMD")
+        wdt = self.get_weight_datatype()
+        W = wdt.bitwidth()
+        idt = self.get_input_datatype()
+        A = idt.bitwidth()
+        if res_type == "dsp":
+            mult_dsp = P * Q * np.ceil((W + A) / 48)  # TODO: more accurate modelling
+        else:
+            mult_dsp = 0
+        return int(mult_dsp)
+
+    def get_template_param_values(self):
+        """Returns the template parameter values according to input, output and weight
+        data types."""
+        ret = dict()
+        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+            raise Exception("True binary (non-bipolar) inputs not yet supported")
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        # fill in TSrcI and TWeightI
+        # TODO check these with Giulio
+        # TODO handle non-bipolar binary inputs
+        if inp_is_bipolar and wt_is_bipolar:
+            ret["TSrcI"] = "Recast<XnorMul>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and wt_is_bipolar:
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Recast<Binary>"
+        elif inp_is_bipolar and (not wt_is_bipolar):
+            ret["TSrcI"] = "Recast<Binary>"
+            ret["TWeightI"] = "Identity"
+        elif (not inp_is_bipolar) and (not wt_is_bipolar):
+            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+            ret["TWeightI"] = "Identity"
+
+        # fill in TDstI
+        ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+        return ret
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
+        self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
+
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode not in ["internal_embedded", "internal_decoupled", "external"]:
+            raise Exception(
+                """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
+        self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"']
+        if self.calc_tmem() != 0:
+            # TODO find a better way of checking for no pregenerated thresholds
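+            # calc_tmem() != 0 means threshold parameters exist for this node,
+            # so the thresh.h header produced by parameter generation is needed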
+            self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
+
+    def defines(self, var):
+        # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements.
+        if var == "ipgen":
+            SIMD = self.get_nodeattr("SIMD")
+            MW = self.get_nodeattr("MW")
+            condition = SIMD >= (MW / 1024)
+            msg = (
+                f"HLS synthesis of MatrixVectorActivation requires: "
+                f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} "
+                f"and MW={MW} for node: {self.onnx_node.name}."
+            )
+            assert condition, msg
+        mem_mode = self.get_nodeattr("mem_mode")
+        numInputVectors = list(self.get_nodeattr("numInputVectors"))
+        numReps = np.prod(numInputVectors)
+        self.code_gen_dict["$DEFINES$"] = [
+            """#define MW1 {}\n #define MH1 {}\n
+            #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n
+            #define TMEM1 {}\n #define numReps {}""".format(
+                self.get_nodeattr("MW"),
+                self.get_nodeattr("MH"),
+                self.get_nodeattr("SIMD"),
+                self.get_nodeattr("PE"),
+                self.calc_wmem(),
+                self.calc_tmem(),
+                numReps,
+            )
+        ]
+        if mem_mode == "internal_decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth()))
+
+    def read_npy_data(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        # note: the innermost dim is reversed for the input
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
+        )
+
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "internal_decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            elem_bits = wdt.bitwidth()
+            packed_bits = self.get_weightstream_width()
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            elem_hls_type = wdt.get_hls_datatype_str()
+            npy_type = "float"
+            npy_in = "%s/weights.npy" % code_gen_dir
+
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);'
+                % (
+                    packed_hls_type,
+                    elem_hls_type,
+                    elem_bits,
+                    npy_type,
+                    npy_in,
+                    self.hls_sname(),
+                )
+            )
+
+    def strm_decl(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+                self.get_instream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+
+        if mem_mode == "internal_decoupled" or mem_mode == "external":
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> weights_{} ("weights_{}");'.format(
+                    self.get_weightstream_width(), self.hls_sname(), self.hls_sname()
+                )
+            )
+
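+    # for illustration only: with internal_embedded weights, resType="lut" and
+    # 8-bit example datatypes, the call emitted below takes roughly the form
+    #   Matrix_Vector_Activate_Batch<MW1, MH1, SIMD1, PE1, 1, Slice<ap_int<8>>,
+    #   Slice<ap_int<8>>, Identity>(in0_V, out_V, weights,
+    #   PassThroughActivation<ap_int<8>>(), numReps, ap_resource_lut());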
+    def docompute(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        map_to_hls_mult_style = {
+            "auto": "ap_resource_dflt()",
+            "lut": "ap_resource_lut()",
+            "dsp": "ap_resource_dsp()",
+        }
+        tmpl_args = self.get_template_param_values()
+        if self.calc_tmem() == 0:
+            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
+            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
+        else:
+            threshs = "threshs"
+        if mem_mode == "internal_embedded":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """Matrix_Vector_Activate_Batch<MW1, MH1, SIMD1, PE1, 1, {}, {}, {}>
+                (in0_{}, out_{}, weights, {}, numReps, {});""".format(
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+        elif mem_mode == "internal_decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            if wdt == DataType["BIPOLAR"]:
+                export_wdt = DataType["BINARY"]
+            else:
+                export_wdt = wdt
+            wdtype_hls_str = export_wdt.get_hls_datatype_str()
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """Matrix_Vector_Activate_Stream_Batch<MW1, MH1, SIMD1, PE1, {}, {}, {}, {}>
+                (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format(
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    wdtype_hls_str,
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+
+        else:
+            raise Exception(
+                """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        shape = self.get_folded_output_shape()
+        shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+        # note: the innermost dim is not reversed for the output
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                self.hls_sname(),
+                shape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        if mem_mode == "internal_embedded":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0_{},
+                hls::stream<ap_uint<{}>> &out_{}
+                )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.hls_sname(),
+                    self.get_outstream_width(),
+                    self.hls_sname(),
+                )
+            ]
+        elif mem_mode == "internal_decoupled" or mem_mode == "external":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(
+                hls::stream<ap_uint<{}>> &in0_{},
+                hls::stream<ap_uint<{}>> &weights_{},
+                hls::stream<ap_uint<{}>> &out_{}
+                )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.hls_sname(),
+                    self.get_weightstream_width(),
+                    self.hls_sname(),
+                    self.get_outstream_width(),
+                    self.hls_sname(),
+                )
+            ]
+
+        else:
+            raise Exception(
+                """Please set mem_mode to "internal_embedded" or "internal_decoupled",
+                currently no other parameter value is supported!"""
+            )
+
+    def pragmas(self):
+        mem_mode = self.get_nodeattr("mem_mode")
+        ram_style_thresholds = self.get_nodeattr("ram_style_thresholds")
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+
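+        # weights are only exposed as an AXI stream when they are streamed in
+        # (internal_decoupled / external); internal_embedded weights and any
+        # thresholds live in on-chip arrays that need partitioning instead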
if mem_mode == "internal_embedded": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == "internal_decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + + else: + raise Exception( + """Please set mem_mode to "internal_embedded", "internal_decoupled", or external, + currently no other parameter value is supported!""" + ) + + # the threshold tensor is acc_type [PE][TMEM][N_THRES] + # partition for parallel access along PE and N_THRES + # dimensions (dims 1 and 3) + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + # add resource pragma for thresholds if set + if ram_style_thresholds == "distributed": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") + ) + elif ram_style_thresholds == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") + ) + elif ram_style_thresholds == "auto": + # no pragma needed + pass + else: + raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds) + + def get_ap_int_max_w(self): + # base class impl (max of inp/out stream widths) + max_of_io = super().get_ap_int_max_w() + # internal_decoupled mode weight stream + weightstream = self.get_weightstream_width() + # single PE weight entry + weight_bits = self.get_weight_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + single_pe_w = simd * weight_bits + return max([weightstream, max_of_io, single_pe_w]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        # create a npy file for each input of the node (in_ind is input index)
+        in_ind = 0
+        for inputs in node.input:
+            # it is assumed that the first input of the node is the data input
+            # the second input are the weights
+            # the third input are the thresholds
+            if in_ind == 0:
+                assert (
+                    str(context[inputs].dtype) == "float32"
+                ), """Input datatype is
+                not float32 as expected."""
+                expected_inp_shape = self.get_folded_input_shape()
+                reshaped_input = context[inputs].reshape(expected_inp_shape)
+                if self.get_input_datatype() == DataType["BIPOLAR"]:
+                    # store bipolar activations as binary
+                    reshaped_input = (reshaped_input + 1) / 2
+                    export_idt = DataType["BINARY"]
+                else:
+                    export_idt = self.get_input_datatype()
+                # make copy before saving the array
+                reshaped_input = reshaped_input.copy()
+                np.save(
+                    os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                    reshaped_input,
+                )
+            elif in_ind > 2:
+                raise Exception("Unexpected input found for MatrixVectorActivation")
+            in_ind += 1
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            # reinterpret binary output as bipolar where needed
+            if self.get_output_datatype() == DataType["BIPOLAR"]:
+                out = context[node.output[0]]
+                out = 2 * out - 1
+                context[node.output[0]] = out
+            assert (
+                context[node.output[0]].shape == self.get_normal_output_shape()
+            ), "cppsim did not produce expected output shape"
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
+            self.reset_rtlsim(sim)
+            self.toggle_clk(sim)
+            if mem_mode == "external" or mem_mode == "internal_decoupled":
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                # we have converted bipolar weights to binary for export,
+                # so use it as such for weight generation
+                if self.get_weight_datatype() == DataType["BIPOLAR"]:
+                    export_wdt = DataType["BINARY"]
+                wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits)
+                num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
+
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode!
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def instantiate_ip(self, cmd): + # instantiate the HLS IP + vlnv = self.get_nodeattr("ip_vlnv") + node_name = self.onnx_node.name + if self.get_nodeattr("mem_mode") == "internal_decoupled": + cmd.append("create_bd_cell -type ip -vlnv %s /%s/%s" % (vlnv, node_name, node_name)) + else: + cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, node_name)) diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py similarity index 60% rename from src/finn/custom_op/fpgadataflow/pool_batch.py rename to src/finn/custom_op/fpgadataflow/hls/pool_hls.py index 8c7bc83141..64c6ec33f8 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -30,11 +30,12 @@ import os from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.pool import Pool from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class Pool_Batch(HLSCustomOp): +class Pool_hls(Pool, HLSBackend): """Class that corresponds to finn-hlslib Pool_batch function. Requires ConvolutionInputGenerator(depthwise == 1) to format its input @@ -54,148 +55,11 @@ class Pool_Batch(HLSCustomOp): """ def get_nodeattr_types(self): - my_attrs = { - "Channels": ("i", True, 0), - "PE": ("i", True, 1), - "KernelSize": ("ints", True, []), - # Function: - # - MaxPool - # - QuantAvgPool - # TODO add support for AvgPool and AccPool - "Function": ("s", True, "", {"MaxPool", "QuantAvgPool"}), - "OutImgDims": ("ints", True, []), - # FINN DataTypes for inputs/outputs - "InputDataType": ("s", True, ""), - "OutputDataType": ("s", True, ""), - "AccumBits": ("i", False, 0), - "Size": ("i", False, 1), - "BatchSize": ("i", False, 1), - } - - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(Pool.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("InputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - fxn = self.get_nodeattr("Function") - odt = DataType[self.get_nodeattr("OutputDataType")] - - if fxn == "MaxPool": - # Same as input - idt = DataType[self.get_nodeattr("InputDataType")] - assert odt == idt, "In datatype must be equal to out datatype for Maxpool" - elif fxn == "QuantAvgPool": - idt = DataType[self.get_nodeattr("InputDataType")] - assert ( - idt.signed() == odt.signed() - ), """QuantAvgPool: Can't mix signed - and unsigned datatypes""" - else: - raise Exception("Pool_Batch doesn't currently support " + fxn) - - return odt - - def get_normal_input_shape(self, ind=0): - ifm_ch = self.get_nodeattr("Channels") - odims = self.get_nodeattr("OutImgDims") - batch_size = self.get_nodeattr("BatchSize") - k = self.get_nodeattr("KernelSize") - k_prod = int(np.prod(k)) - ishape = (batch_size, *odims, k_prod * ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - normal_ishape = list(self.get_normal_input_shape()) - ifm_ch = 
self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - assert ifm_ch % pe == 0, "PE must divide input channels" - fold = int(normal_ishape[-1] / pe) - folded_ishape = normal_ishape[:-1] + [fold, pe] - return tuple(folded_ishape) - - def get_normal_output_shape(self, ind=0): - ofm_ch = self.get_nodeattr("Channels") - odims = self.get_nodeattr("OutImgDims") - batch_size = self.get_nodeattr("BatchSize") - oshape = (batch_size, *odims, ofm_ch) - return oshape - - def get_folded_output_shape(self, ind=0): - normal_oshape = list(self.get_normal_output_shape()) - ifm_ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - assert ifm_ch % pe == 0, "PE must divide input channels" - fold = int(ifm_ch / pe) - folded_oshape = normal_oshape[:-1] + [fold, pe] - return tuple(folded_oshape) - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[1:-1]) - - def get_exp_cycles(self): - # (Channels * kernel * kernel) / PE * odim * odim * batch_size - ifm_ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - k = self.get_nodeattr("KernelSize") - k_prod = int(np.prod(k)) - odims = self.get_nodeattr("OutImgDims") - batch_size = self.get_nodeattr("BatchSize") - exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size - return int(exp_cycles) - - def get_instream_width(self, ind=0): - dt_bits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = int(dt_bits * pe) - return in_width - - def get_outstream_width(self, ind=0): - dt_bits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = int(dt_bits * pe) - return out_width - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch." 
-        return super().make_const_shape_op(oshape)
-
-    def infer_node_datatype(self, model):
-        node = self.onnx_node
-        # data type stays the same
-        dtype = self.get_output_datatype()
-        model.set_tensor_datatype(node.output[0], dtype)
-
-    def verify_node(self):
-        info_messages = []
-        # verify that "backend" is set to "fpgadataflow"
-        backend_value = self.get_nodeattr("backend")
-        if backend_value == "fpgadataflow":
-            info_messages.append("Attribute backend is set correctly")
-        else:
-            info_messages.append('Attribute backend should be set to "fpgadataflow"')
-
-        # verify the number of inputs
-        if len(self.onnx_node.input) == 1:
-            info_messages.append("The number of inputs is correct")
-        else:
-            info_messages.append("""Pool_Batch needs 1 data input""")
-
-        # check supported function
-        fnx = self.get_nodeattr("Function")
-        if fnx in ["MaxPool", "QuantAvgPool"]:
-            info_messages.append("Attribute Function contains a supported pool function")
-        else:
-            info_messages.append("Attribute Function contains an unsupported pool function")
-        return info_messages
-
     def global_includes(self):
         self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"']
         self.code_gen_dict["$GLOBALS$"] += ['#include "maxpool.h"']
@@ -246,19 +110,6 @@ def read_npy_data(self):
             )
         )
 
-    def strm_decl(self):
-        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
-                self.get_instream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
-                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
-            )
-        )
-
     def docompute(self):
         idt = self.get_input_datatype()
         i_hls_dt = idt.get_hls_datatype_str()
@@ -319,9 +170,6 @@ def dataoutstrm(self):
             )
         ]
 
-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
     def blackboxfunction(self):
         packed_ibits = self.get_instream_width()
         packed_in_hls_type = "ap_uint<%d>" % packed_ibits
@@ -339,15 +187,6 @@ def blackboxfunction(self):
             )
         ]
 
-    def pragmas(self):
-        self.code_gen_dict["$PRAGMAS$"] = [
-            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
-        ]
-        self.code_gen_dict["$PRAGMAS$"].append(
-            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
-        )
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
-
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
new file mode 100644
index 0000000000..d1f58d3e87
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
@@ -0,0 +1,215 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.streamingdatawidthconverter import (
+    StreamingDataWidthConverter,
+)
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+# does not do anything at the ONNX node-by-node level, and input-output
+# tensor shapes are the same. performs data width conversion at the rtlsim level
+
+
+class StreamingDataWidthConverter_hls(StreamingDataWidthConverter, HLSBackend):
+    """Class that corresponds to finn-hlslib StreamingDataWidthConverter_Batch
+    function."""
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(StreamingDataWidthConverter.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
+
+    def defines(self, var):
+        numReps = 1
+        numInWords = int(np.prod(self.get_folded_input_shape()[:-1]))
+        inWidth = self.get_nodeattr("inWidth")
+        outWidth = self.get_nodeattr("outWidth")
+        self.code_gen_dict["$DEFINES$"] = [
+            "#define InWidth %d " % inWidth,
+            "#define OutWidth %d " % outWidth,
+            "#define NumInWords %d " % numInWords,
+            "#define numReps %d" % numReps,
+        ]
+        if self.needs_lcm():
+            lcmWidth = self.get_iowidth_lcm()
+            assert numInWords % (lcmWidth / inWidth) == 0, "Error in DWC LCM calculation"
+            numLCMToOut = numInWords // (lcmWidth / inWidth)
+            self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth)
+            self.code_gen_dict["$DEFINES$"].append("#define NumLCMToOut %d" % (numLCMToOut))
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+                self.get_instream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+        if self.needs_lcm():
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<{}>> intermediate ("intermediate");'.format(
+                    self.get_iowidth_lcm()
+                )
+            )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+
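+    # when neither stream width divides the other, needs_lcm() is True and
+    # the conversion runs in two stages through the intermediate stream of
+    # width lcm(inWidth, outWidth), e.g. 6 bit -> 12 bit -> 4 bit;
+    # otherwise a single converter call suffices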
self.hls_sname()), + "%s(intermediate, out_%s, numReps);" + % (op, self.hls_sname()), + ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + "%s(in0_%s, out_%s, numReps);" + % (op, self.hls_sname(), self.hls_sname()) + ] + + def blackboxfunction(self): + in_packed_bits = self.get_instream_width() + in_packed_hls_type = "ap_uint<%d>" % in_packed_bits + out_packed_bits = self.get_outstream_width() + out_packed_hls_type = "ap_uint<%d>" % out_packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + in_packed_hls_type, + self.hls_sname(), + out_packed_hls_type, + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + if self.needs_lcm(): + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW disable_start_propagation") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_shape = self.get_normal_input_shape() + folded_ishape = self.get_folded_input_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." + + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # reshape input into folded shape + reshaped_input = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = reshaped_input.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(exp_shape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to "rtlsim" """.format( + mode + ) + ) + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert context[node.output[0]].shape == tuple( + exp_shape + ), """Output + shape doesn't match expected shape, should be same as input shape""" diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py similarity index 62% rename from src/finn/custom_op/fpgadataflow/eltwise.py rename to src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py index ab1dc00118..0d618d832a 100644 --- a/src/finn/custom_op/fpgadataflow/eltwise.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,111 +28,24 @@ import numpy as np import os -import warnings -from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class StreamingEltwise(HLSCustomOp): +class StreamingEltwise_hls(StreamingEltwise, HLSBackend): """Class that corresponds to finn-hlslib StreamingEltwise function.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = super().get_nodeattr_types() - my_attrs.update( - { - "NumChannels": ("i", True, ""), - "PE": ("i", True, ""), - # FINN DataTypes for inputs; output datatype inferred from input - "inputDataType0": ("s", True, ""), - "inputDataType1": ("s", True, ""), - # type of EltwiseFunction for the operation - "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - "inFIFODepths": ("ints", False, [2, 2]), - } - ) + my_attrs = {} + my_attrs.update(StreamingEltwise.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_eltwise_op_lambda(self): - eltwise_op = self.get_nodeattr("eltwiseOp") - idt0 = self.get_input_datatype(0) - idt1 = self.get_input_datatype(1) - odt = self.get_output_datatype() - tin0 = idt0.get_hls_datatype_str() - tin1 = idt1.get_hls_datatype_str() - tout = odt.get_hls_datatype_str() - eltwise_ops = { - # "Add": "[](auto a, auto b) { return a + b; }", - # "Sub": "[](auto a, auto b) { return a - b; }", - # "AbsDiff": "[](auto a, auto b) { return a>b? 
a-b : b-a; }", - "Add": f"add<{tin0}, {tin1}, {tout}>()", - "Sub": f"sub<{tin0}, {tin1}, {tout}>()", - "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()", - } - return eltwise_ops[eltwise_op] - - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich]) - return ishape - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - assert ich % pe == 0, "PE must divide NumChannels" - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich // pe, pe]) - return ishape - - def get_normal_output_shape(self, ind=0): - return self.get_normal_input_shape() - - def get_folded_output_shape(self, ind=0): - return self.get_folded_input_shape() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input1 shape." - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) - assert ishape == exp_ishape, "Unexpected input2 shape." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt0 = model.get_tensor_datatype(node.input[0]) - if idt0 != self.get_input_datatype(0): - warn_str = "inputDataType0 changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype(0)), - str(idt0), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType0", idt0.name) - idt1 = model.get_tensor_datatype(node.input[1]) - if idt1 != self.get_input_datatype(1): - warn_str = "inputDataType1 changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype(1)), - str(idt1), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType1", idt1.name) - # enforce output data type (calculated based on idt) - odt = self.get_output_datatype() - model.set_tensor_datatype(self.onnx_node.output[0], odt) - def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -157,60 +70,6 @@ def verify_node(self): return info_messages - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType" + str(ind))] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - op = self.get_nodeattr("eltwiseOp") - idt0 = self.get_input_datatype(0) - idt1 = self.get_input_datatype(1) - assert idt0.signed() == idt1.signed(), ( - "%s: Inputs must have same signedness" % self.onnx_node.name - ) - idt0_min, idt0_max = idt0.min(), idt0.max() - idt1_min, idt1_max = idt1.min(), idt1.max() - cands = [ - idt0_min - idt1_min, - idt0_min - idt1_max, - idt0_max - idt1_min, - idt0_max - idt1_max, - ] - largest_magnitude = max(map(abs, cands)) - if op == "Add": - if idt0.signed(): - return DataType.get_smallest_possible(idt0.min() + idt1.min()) - else: - return DataType.get_smallest_possible(idt0.max() + idt1.max()) - elif op == "Sub": - return DataType.get_smallest_possible(-largest_magnitude) - elif op == "AbsDiff": - return DataType.get_smallest_possible(largest_magnitude) - else: - raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op)) - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype(ind).bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, 
ind=0): - """Returns output stream width.""" - obits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = pe * obits - return out_width - - def get_number_output_values(self): - return np.prod(self.get_folded_output_shape()[:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node @@ -422,34 +281,6 @@ def docompute(self): ) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, hls::stream> &in1_{}, @@ -475,10 +306,3 @@ def pragmas(self): "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - sname = self.hls_sname() - swidth = self.get_instream_width_padded() - intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] - return intf_names diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py new file mode 100755 index 0000000000..69db7b4606 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py @@ -0,0 +1,222 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+from qonnx.core.datatype import DataType
+from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim
+
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class StreamingMaxPool_hls(StreamingMaxPool, HLSBackend):
+    """Class that corresponds to finn-hlslib StreamingMaxPool_batch function."""
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(StreamingMaxPool.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def verify_node(self):
+        info_messages = []
+        # verify that "backend" is set to "fpgadataflow"
+        backend_value = self.get_nodeattr("backend")
+        if backend_value == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        # verify the number of inputs
+        if len(self.onnx_node.input) == 1:
+            info_messages.append("The number of inputs is correct")
+        else:
+            info_messages.append("""StreamingMaxPool_Batch needs 1 data input""")
+
+        return info_messages
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
+
+    def defines(self, var):
+        numReps = 1
+        ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized()
+        ceil_mode = self.get_nodeattr("CeilMode")
+        output_size = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode)
+
+        if self.is_1d():
+            self.code_gen_dict["$DEFINES$"] = [
+                """#define ImgDim {}\n #define PoolDim {}\n
+                #define NumChannels {}\n #define PE {}\n #define OutputSize {}
+                \n #define numReps {}""".format(
+                    ifm_dim[1],
+                    k[1],
+                    self.get_nodeattr("NumChannels"),
+                    self.get_nodeattr("PE"),
+                    output_size,
+                    numReps,
+                )
+            ]
+        else:
+            self.code_gen_dict["$DEFINES$"] = [
+                """#define ImgDim {}\n #define PoolDim {}\n
+                #define NumChannels {}\n #define numReps {}""".format(
+                    ifm_dim[1],
+                    k[1],
+                    self.get_nodeattr("NumChannels"),
+                    numReps,
+                )
+            ]
+
+    def docompute(self):
+        dtype = self.get_input_datatype()
+        if dtype.bitwidth() == 1:
+            if self.is_1d():
+                raise Exception("Binary 1d MaxPool not implemented on HLS backend")
+            else:
+                op = "StreamingMaxPool"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    "%s<ImgDim, PoolDim, NumChannels>(in0_%s, out_%s);"
+                    % (op, self.hls_sname(), self.hls_sname())
+                ]
+        else:
+            dtype = self.get_input_datatype()
+            dtype_hls = dtype.get_hls_datatype_str()
+            minval_str = str(int(dtype.min()))
+            if self.is_1d():
+                op = "StreamingMaxPool_Precision_1d"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    """%s<ImgDim, PoolDim, NumChannels, PE,
+                    OutputSize, %s, %s>(in0_%s, out_%s);"""
+                    % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname())
+                ]
+            else:
+                op = "StreamingMaxPool_Precision"
+                self.code_gen_dict["$DOCOMPUTE$"] = [
+                    "%s<ImgDim, PoolDim, NumChannels, %s, %s>(in0_%s, out_%s);"
+                    % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname())
+                ]
+
+    def blackboxfunction(self):
+        packed_bits = self.get_instream_width()
+
packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim \ + did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim, ofm_dim, ifm_ch).""" diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py similarity index 75% rename from src/finn/custom_op/fpgadataflow/thresholding_batch.py rename to src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 72ee2f7af6..b753bc7a03 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -29,15 +29,12 @@ import numpy as np import os import textwrap -import warnings from math import ceil, log2 from qonnx.core.datatype import DataType -from qonnx.util.basic import ( - interleave_matrix_outer_dim_from_partitions, - roundup_to_integer_multiple, -) +from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.thresholding import Thresholding from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, @@ -52,7 +49,7 @@ # the ... here can be any shape (representing groups of vectors) -class Thresholding_Batch(HLSCustomOp): +class Thresholding_hls(Thresholding, HLSBackend): """Class that corresponds to finn-hls Thresholding_Batch function.""" def __init__(self, onnx_node, **kwargs): @@ -60,30 +57,18 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = { - # parallelization; channels thresholded per cycle - "PE": ("i", True, 0), - # number of channels (each may have different thresholds) - "NumChannels": ("i", True, 0), - # number of steps in thresholding function - "numSteps": ("i", True, 1), + # memory mode for the thresholds + # internal_embedded -- embedded thresholds + # internal_decoupled -- default, streaming thresholds with streamer packaged inside IP + "mem_mode": ( + "s", + False, + "internal_decoupled", + {"internal_embedded", "internal_decoupled"}, + ), # string defining memory type "ram_style": ("s", False, "distributed", {"distributed", "block"}), - # FINN DataTypes for inputs, outputs - "inputDataType": ("s", True, ""), - "weightDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - # initialization value for the thresholding accumulator - "ActVal": ("i", False, 0), - # memory mode for the thresholds - # const -- embedded thresholds, default - # decoupled -- streaming thresholds with streamer packaged inside IP - "mem_mode": ("s", False, "const", {"const", "decoupled"}), - # (mem_mode = decoupled only) whether weights (thresholds) will be + # (mem_mode = internal_decoupled only) whether weights (thresholds) will be # writable through an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. # see finn-rtllib/memstream/doc/README for more about the memory @@ -94,58 +79,10 @@ def get_nodeattr_types(self): # weight data from the weight FIFOs. 
"runtime_writeable_weights": ("i", False, 0, {0, 1}), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(Thresholding.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def calc_tmem(self): - """Calculates and returns TMEM.""" - mh = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - return mh // pe - - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype().name), - str(idt.name), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - # TODO collect automatically from get_nodeattr_types - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("NumChannels") - self.get_nodeattr("PE") - self.get_nodeattr("inputDataType") - self.get_nodeattr("outputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append("""The required Threshold_Batch attributes do not exist.""") - - return info_messages - def bram_estimation(self): """Calculates BRAM cost if resource set to BRAM""" style = self.get_nodeattr("ram_style") @@ -177,55 +114,9 @@ def lut_estimation(self): # total cost return comparator_cost + lutram_cost - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_weight_datatype(self): - """Returns FINN DataType of thresholds, here called weights.""" - return DataType[self.get_nodeattr("weightDataType")] - - def minimize_accumulator_width(self, model): - "Minimize threshold width ('accumulator width' here due to convention)" - thresholds = model.get_initializer(self.onnx_node.input[1]) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - min_input = self.get_input_datatype().min() - max_input = self.get_input_datatype().max() - # get range required by threshold values - tdt_min = min(min_input, min_threshold) - tdt_max = max(max_input, max_threshold) - if tdt_min < 0: - if abs(tdt_min) > tdt_max: - tdt = DataType.get_smallest_possible(tdt_min) - else: - tdt = DataType.get_smallest_possible(-tdt_max - 1) - else: - tdt = DataType.get_smallest_possible(tdt_max) - assert np.vectorize(tdt.allowed)( - threshold_tensor - ).all(), "Thresholds can't be expressed with type %s" % str(tdt) - self.set_nodeattr("weightDataType", tdt.name) - # Update QONNX DataType of tensor for consistency - model.set_tensor_datatype(self.onnx_node.input[1], tdt) - return 
DataType[self.get_nodeattr("weightDataType")] - - def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - return i_bits * self.get_nodeattr("PE") - - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - return o_bits * self.get_nodeattr("PE") - def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" - if self.get_nodeattr("mem_mode") == "decoupled": + """Returns weight stream width. Used only in internal_decoupled mode.""" + if self.get_nodeattr("mem_mode") == "internal_decoupled": pe = self.get_nodeattr("PE") wp = self.get_weight_datatype().bitwidth() n_thres_steps = self.get_nodeattr("numSteps") @@ -236,44 +127,16 @@ def get_weightstream_width(self): def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" + by the AXI Stream spec. Used in internal_decoupled mode.""" weight_width = self.get_weightstream_width() return roundup_to_integer_multiple(weight_width, 8) def get_ap_int_max_w(self): - temp_value = super().get_ap_int_max_w() - weightstream = self.get_weightstream_width() - return max([weightstream, temp_value]) - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - fold = ich // pe - vecs = list(self.get_nodeattr("numInputVectors")) - folded_input_shape = tuple(vecs + [fold, pe]) - return folded_input_shape - - def get_folded_output_shape(self, ind=0): - # same shape as input - return self.get_folded_input_shape() - - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_input_shape = tuple(vecs + [ich]) - return normal_input_shape - - def get_normal_output_shape(self, ind=0): - # same shape as input - return self.get_normal_input_shape() - - def get_number_output_values(self): - nf = np.prod(self.get_folded_output_shape()[:-1]) - return nf - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) + ap_int_max_w = HLSBackend.get_ap_int_max_w(self) + if self.get_nodeattr("mem_mode") == "internal_decoupled": + weightstream = self.get_weightstream_width() + ap_int_max_w = max([weightstream, ap_int_max_w]) + return ap_int_max_w def get_template_param_values(self): """Returns the template parameter values according to input, output and weight @@ -288,50 +151,6 @@ def get_template_param_values(self): return ret - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for unsigned inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return - """ - mh = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - tmem = mh // pe - assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." 
- assert ( - orig_thres_matrix.ndim == 2 - ), """Threshold matrix dimension is - not as expected (2).""" - n_thres_steps = orig_thres_matrix.shape[1] - assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" - if not self.get_input_datatype().signed(): - # ensure all thresholds are nonnegative - assert (orig_thres_matrix >= 0).all() - # ensure all thresholds are integer - assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" - ret = orig_thres_matrix - # ensure channels = mh , duplicating if necessary - if ret.shape[0] == 1: - ret = np.tile(ret, (mh, 1)) - assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - assert ( - ret.shape[0] == pe - ), """First dimension after distribution of the - rows between PEs is not as expected (pe)""" - assert ( - ret.shape[1] == tmem - ), """Second dimension after distribution of the - rows between PEs is not as expected (tmem)""" - assert ( - ret.shape[2] == n_thres_steps - ), """Third dimension after distribution of the - rows between PEs is not as expected (n_thres_steps)""" - return ret.reshape(1, pe, tmem, n_thres_steps) - def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights (thresholds) in appropriate format for this layer. This file can be used for either synthesis or @@ -345,7 +164,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): * weight_file_name : filename for the weight file to be generated """ - threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) + threshold_tensor = self.get_hw_compatible_threshold_tensor(weights) tdt = self.get_weight_datatype() assert np.vectorize(tdt.allowed)( threshold_tensor @@ -439,12 +258,12 @@ def generate_params(self, model, path): code_gen_dir = path thresholds = model.get_initializer(self.onnx_node.input[1]) mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "internal_embedded": # save thresholds in thresh.h weight_filename = "{}/thresh.h".format(code_gen_dir) self.make_weight_file(thresholds, "hls_header", weight_filename) - elif mem_mode == "decoupled": - # save decoupled weights for cppsim + elif mem_mode == "internal_decoupled": + # save internal_decoupled weights for cppsim weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) # also save weights as Verilog .dat file @@ -518,7 +337,7 @@ def execute_node(self, context, graph): inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) - if self.get_nodeattr("mem_mode") == "decoupled": + if self.get_nodeattr("mem_mode") == "internal_decoupled": wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() wei = npy_to_rtlsim_input( @@ -531,7 +350,7 @@ def execute_node(self, context, graph): } self.rtlsim_multi_io(sim, io_dict) output = io_dict["outputs"]["out"] - elif self.get_nodeattr("mem_mode") == "const": + elif self.get_nodeattr("mem_mode") == "internal_embedded": output = self.rtlsim(sim, inp) else: raise Exception("Unrecognized mem_mode") @@ -557,7 +376,7 @@ def execute_node(self, context, graph): def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] - if self.get_nodeattr("mem_mode") == "const": + if self.get_nodeattr("mem_mode") 
== "internal_embedded":
             self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']

     # TODO check and add whatever missing
@@ -575,7 +394,7 @@ def defines(self, var):
                 total_spatial_size,
             )
         ]
-        if self.get_nodeattr("mem_mode") == "decoupled":
+        if self.get_nodeattr("mem_mode") == "internal_decoupled":
             self.code_gen_dict["$DEFINES$"].append(
                 "#define ActVal1 %d" % self.get_nodeattr("ActVal")
             )
@@ -609,7 +428,7 @@ def read_npy_data(self):
             )
         )
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled":
+        if mem_mode == "internal_decoupled":
             tdt = self.get_weight_datatype()
             elem_bits = tdt.bitwidth()
             packed_bits = self.get_weightstream_width()
@@ -643,7 +462,7 @@ def strm_decl(self):
             )
         )
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled":
+        if mem_mode == "internal_decoupled":
             self.code_gen_dict["$STREAMDECLARATIONS$"].append(
                 'hls::stream<ap_uint<{}>> weights_{} ("weights_{}");'.format(
                     self.get_weightstream_width(), self.hls_sname(), self.hls_sname()
@@ -652,20 +471,18 @@ def docompute(self):
     def docompute(self):
         tmpl_args = self.get_template_param_values()
-        node = self.onnx_node
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const":
+        if mem_mode == "internal_embedded":
             self.code_gen_dict["$DOCOMPUTE$"] = [
-                """{}<ImgDim1, NumChannels1, PE1, {}, {}>
+                """Thresholding_Batch<ImgDim1, NumChannels1, PE1, {}, {}>
                 (in0_{}, out_{}, threshs, numReps);""".format(
-                    node.op_type,
                     tmpl_args["TSrcI"],
                     tmpl_args["TDstI"],
                     self.hls_sname(),
                     self.hls_sname(),
                 )
             ]
-        elif mem_mode == "decoupled":
+        elif mem_mode == "internal_decoupled":
             # note that numReps is set to 1 in the invocation below, since
             # - for cppsim the repetition comes from the threshold stream reader+input
             # - for synth the unit runs continuously anyway (ap_ctrl_none)
@@ -712,11 +529,8 @@ def dataoutstrm(self):
             )
         ]

-    def save_as_npy(self):
-        self.code_gen_dict["$SAVEASCNPY$"] = []
-
     def blackboxfunction(self):
-        if self.get_nodeattr("mem_mode") == "const":
+        if self.get_nodeattr("mem_mode") == "internal_embedded":
             self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                 """void {}(hls::stream<ap_uint<{}>> &in0_{},
                 hls::stream<ap_uint<{}>> &out_{}
@@ -728,7 +542,7 @@ def blackboxfunction(self):
                     self.hls_sname(),
                 )
             ]
-        elif self.get_nodeattr("mem_mode") == "decoupled":
+        elif self.get_nodeattr("mem_mode") == "internal_decoupled":
             self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
                 """void {}(hls::stream<ap_uint<{}>> &in0_{},
                 hls::stream<ap_uint<{}>> &weights_{},
@@ -755,7 +569,7 @@ def pragmas(self):
         )
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")

-        if self.get_nodeattr("mem_mode") == "const":
+        if self.get_nodeattr("mem_mode") == "internal_embedded":
             # the threshold tensor is acc_type [PE][TMEM][N_THRES]
             # partition for parallel access along PE and N_THRES
             # dimensions (dims 1 and 3)
@@ -787,7 +601,7 @@ def pragmas(self):
                     ram_style
                 )
             )
-        elif self.get_nodeattr("mem_mode") == "decoupled":
+        elif self.get_nodeattr("mem_mode") == "internal_decoupled":
             self.code_gen_dict["$PRAGMAS$"].append(
                 "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname()
             )
@@ -796,7 +610,7 @@ def code_generation_ipi(self):
         cmd = []
         # add streamer if needed
         mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled":
+        if mem_mode == "internal_decoupled":
             node_name = self.onnx_node.name
             runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
             sname = self.hls_sname()
@@ -889,8 +703,8 @@
             # TODO calculate and pass in segment size here
             cmd.append("assign_bd_address")
             cmd.append("save_bd_design")
-        elif
mem_mode == "internal_embedded": + # base class impl sufficient for internal_embedded mode return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for Thresholding_Batch") @@ -899,7 +713,7 @@ def code_generation_ipi(self): def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if runtime_writable: @@ -931,7 +745,7 @@ def derive_characteristic_fxns(self, period): "outputs": {"out": []}, } mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: + if mem_mode in ["internal_decoupled", "external"]: n_weight_inps = self.calc_tmem() num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py similarity index 96% rename from src/finn/custom_op/fpgadataflow/tlastmarker.py rename to src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py index 9309841b2e..2e908016e7 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,10 +27,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class TLastMarker(HLSCustomOp): +class TLastMarker_hls(HWCustomOp, HLSBackend): """Node that adds/removes AXI stream TLAST signals where needed. Its behavior is transparent in node-by-node execution, only visible in IP-stitched rtlsim or actual hardware. @@ -56,7 +58,8 @@ def get_nodeattr_types(self): # Vitis docs recommend using qdma_axis for external, ap_axiu for internal "Protocol": ("s", False, "external", {"external", "internal"}), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(HWCustomOp.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs def execute_node(self, context, graph): @@ -185,9 +188,6 @@ def docompute(self): def dataoutstrm(self): self.code_gen_dict["$DATAOUTSTREAM$"] = [] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): dyn_iters = self.get_nodeattr("DynIters") diff --git a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py new file mode 100644 index 0000000000..05d26eddb2 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py @@ -0,0 +1,175 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class UpsampleNearestNeighbour_hls(UpsampleNearestNeighbour, HLSBackend):
+    """
+    Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function.
+    Upsampling is done with the Nearest Neighbour algorithm.
+    The layer expects square feature maps for the input and output.
+    """
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(UpsampleNearestNeighbour.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def verify_node(self):
+        pass
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"']
+
+    def defines(self, var):
+        self.code_gen_dict["$DEFINES$"] = []
+
+        ifm_ch = self.get_nodeattr("NumChannels")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)]
+
+        ibits = self.get_input_datatype().bitwidth()
+        self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)]
+
+        idim = self.get_nodeattr("IFMDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)]
+
+        odim = self.get_nodeattr("OFMDim")
+        self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)]
+
+        batch_size = self.get_nodeattr("numInputVectors")
+        self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)]
+
+    def docompute(self):
+        is_2d = self.get_nodeattr("DimMode") == 0
+        batch = self.get_nodeattr("numInputVectors")
+        if is_2d:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """UpsampleNearestNeighbour_Batch<OFMDim, IFMDim, IFMChannels,
+                ap_uint<Input_precision> > (in0_%s, out_%s, numReps);"""
+                % (self.hls_sname(), self.hls_sname())
+            ]
+        else:
+            assert batch == 1, "1D upsampler currently needs numReps=1"
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """UpsampleNearestNeighbour_1D<OFMDim, IFMDim, IFMChannels,
+                ap_uint<Input_precision> > (in0_%s, out_%s);"""
+                % (self.hls_sname(), self.hls_sname())
+            ]
+
+    def blackboxfunction(self):
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+            % (
+                self.onnx_node.name,
+ packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_oshape = self.get_folded_output_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" + export_idt = self.get_input_datatype() + self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim, OutputDim, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py new file mode 100644 index 0000000000..fbae9eb9b8 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -0,0 +1,462 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class VVAU_hls(VVAU, HLSBackend): + """Corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(VVAU.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for VectorVectorActivation") + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == self.get_normal_output_shape() + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + + if mem_mode == "external" or mem_mode == "internal_decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) + dim_h, dim_w = self.get_nodeattr("Dim") + num_w_reps = dim_h * dim_w + + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + # fill in TSrcI and TWeightI + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["internal_embedded", "internal_decoupled", "external"]: + raise Exception( + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", + currently no other parameter value is supported!""" + ) + if self.calc_tmem() != 0: + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + dim_h, dim_w = self.get_nodeattr("Dim") + numReps = 1 * dim_h * dim_w + k_h, k_w = self.get_nodeattr("Kernel") + innerProdDim = k_h * k_w + mem_mode = self.get_nodeattr("mem_mode") + + self.code_gen_dict["$DEFINES$"] = [ + """#define Channels1 {}\n #define InnerProdDim {}\n + #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( + self.get_nodeattr("Channels"), + innerProdDim, + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + numReps, + ) + ] + if mem_mode == "internal_decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: 
the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + if mem_mode == "internal_decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + + if mem_mode == "internal_embedded": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Vector_Vector_Activate_Batch + (in0_{}, out_{}, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "internal_decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( + "Vector_Vector_Activate_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + else: + raise Exception( + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = 
"%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_embedded": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == "internal_decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + else: + raise Exception( + """Please set mem_mode to "internal_embedded" or "internal_decoupled", + currently no other parameter value is supported!""" + ) + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "internal_embedded": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == "internal_decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + else: + raise Exception( + """Please set mem_mode to "internal_embedded", "internal_decoupled", or external, + currently no other parameter value is supported!""" + ) + + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + + def instantiate_ip(self, cmd): + # instantiate the HLS IP + vlnv = self.get_nodeattr("ip_vlnv") + node_name = self.onnx_node.name + if self.get_nodeattr("mem_mode") == "internal_decoupled": + cmd.append("create_bd_cell -type ip -vlnv %s /%s/%s" % (vlnv, node_name, node_name)) + else: + cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, node_name)) diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py new file mode 100644 index 0000000000..d8210fd684 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -0,0 +1,476 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. 
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+import subprocess
+from abc import ABC, abstractmethod
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow import templates
+from finn.util.basic import CppBuilder, get_rtlsim_trace_depth, make_build_dir
+from finn.util.hls import CallHLS
+from finn.util.pyverilator import make_single_source_file
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+
+class HLSBackend(ABC):
+    """Base class providing the functionality that all custom ops corresponding
+    to a finn-hlslib function make use of. Contains the functions every HLS
+    custom node should have; some are abstract methods that have to be filled
+    in when writing a new HLS custom op node."""
+
+    def get_nodeattr_types(self):
+        return {
+            "code_gen_dir_cppsim": ("s", False, ""),
+            "executable_path": ("s", False, ""),
+            "res_hls": ("s", False, ""),
+        }
+
+    def get_all_verilog_paths(self):
+        "Return list of all folders containing Verilog code for this node."
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        assert (
+            code_gen_dir != ""
+        ), """Node attribute "code_gen_dir_ipgen" is
+        not set. Please run HLSSynthIP first."""
+        verilog_path = "{}/project_{}/sol1/impl/verilog/".format(code_gen_dir, self.onnx_node.name)
+        # default impl only returns the HLS verilog codegen dir
+        return [verilog_path]
+
+    def get_all_verilog_filenames(self, abspath=False):
+        "Return list of all Verilog files used for this node."
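+        # scans every folder returned by get_all_verilog_paths() and collects
+        # the files ending in .v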
+ + verilog_files = [] + verilog_paths = self.get_all_verilog_paths() + for verilog_path in verilog_paths: + for f in os.listdir(verilog_path): + if f.endswith(".v"): + if abspath: + verilog_files += [verilog_path + "/" + f] + else: + verilog_files += [f] + return verilog_files + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + verilog_files = self.get_all_verilog_filenames(abspath=True) + single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") + tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") + target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v" + make_single_source_file(verilog_files, target_file) + + # build the Verilator emu library + sim = PyVerilator.build( + self.get_verilog_top_module_name() + ".v", + build_dir=tmp_build_dir, + verilog_path=[single_src_dir], + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def code_generation_ipgen(self, model, fpgapart, clk): + """Generates c++ code and tcl script for ip generation.""" + node = self.onnx_node + + # generate top cpp file for ip generation + path = self.get_nodeattr("code_gen_dir_ipgen") + self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] + self.generate_params(model, path) + self.global_includes() + self.defines("ipgen") + self.blackboxfunction() + self.pragmas() + self.docompute() + + template = templates.ipgen_template + + for key in self.code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(self.code_gen_dict[key]) + template = template.replace(key, code_gen_line) + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + f = open(os.path.join(code_gen_dir, "top_{}.cpp".format(node.name)), "w") + f.write(template) + f.close() + self.code_gen_dict.clear() + + # generate tcl script for ip generation + self.code_gen_dict["$PROJECTNAME$"] = ["project_{}".format(node.name)] + self.code_gen_dict["$HWSRCDIR$"] = [code_gen_dir] + self.code_gen_dict["$FPGAPART$"] = [fpgapart] + self.code_gen_dict["$TOPFXN$"] = [node.name] + self.code_gen_dict["$CLKPERIOD$"] = [str(clk)] + self.code_gen_dict["$DEFAULT_DIRECTIVES$"] = self.ipgen_default_directives() + self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives() + + template = templates.ipgentcl_template + + for key in self.code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(self.code_gen_dict[key]) + template = template.replace(key, code_gen_line) + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + f = open(os.path.join(code_gen_dir, "hls_syn_{}.tcl".format(node.name)), "w") + f.write(template) + f.close() + self.code_gen_dict.clear() + + def ipgen_default_directives(self): + """Return list of default HLS synthesis directives""" + + default_directives = [ + "set_param hls.enable_hidden_option_error false", + "config_compile -disable_unroll_code_size_check -pipeline_style flp", + "config_interface -m_axi_addr64", + "config_rtl -module_auto_prefix", + "config_rtl -deadlock_detection none", + ] + return default_directives + + def ipgen_extra_directives(self): + "Return a list of extra tcl 
directives for HLS synthesis."
+        return []
+
+    def ipgen_singlenode_code(self):
+        """Builds the bash script for IP generation using the CallHLS utility."""
+        node = self.onnx_node
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        builder = CallHLS()
+        builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name))
+        builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name))
+        builder.build(code_gen_dir)
+        ipgen_path = builder.ipgen_path
+        assert os.path.isdir(ipgen_path), "IPGen failed: %s not found" % (ipgen_path)
+        self.set_nodeattr("ipgen_path", ipgen_path)
+        ip_path = ipgen_path + "/sol1/impl/ip"
+        assert os.path.isdir(ip_path), "IPGen failed: %s not found. Check log under %s" % (
+            ip_path,
+            code_gen_dir,
+        )
+        self.set_nodeattr("ip_path", ip_path)
+        vlnv = "xilinx.com:hls:%s:1.0" % node.name
+        self.set_nodeattr("ip_vlnv", vlnv)
+
+    def code_generation_cppsim(self, model):
+        """Generates c++ code for simulation (cppsim)."""
+        node = self.onnx_node
+        path = self.get_nodeattr("code_gen_dir_cppsim")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+        self.generate_params(model, path)
+        self.global_includes()
+        self.defines("cppsim")
+        self.read_npy_data()
+        self.strm_decl()
+        self.pragmas()
+        self.docompute()
+        self.dataoutstrm()
+        self.save_as_npy()
+
+        template = templates.docompute_template
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w")
+        f.write(template)
+        f.close()
+        self.code_gen_dict.clear()
+
+    def code_generation_ipi(self):
+        """Constructs and returns the TCL for node instantiation in Vivado IPI."""
+        vlnv = self.get_nodeattr("ip_vlnv")
+        cmd = ["create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)]
+        return cmd
+
+    def compile_singlenode_code(self):
+        """Builds the bash script for compilation using the CppBuilder from
+        finn.util.basic and executes the script to produce the executable."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        builder = CppBuilder()
+        # to enable additional debug features please uncomment the next line
+        # builder.append_includes("-DDEBUG")
+        builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp")
+        builder.append_includes("-I$FINN_ROOT/deps/cnpy/")
+        builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib")
+        builder.append_includes("-I$FINN_ROOT/custom_hls")
+        builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"]))
+        builder.append_includes("--std=c++14")
+        builder.append_includes("-O3")
+        builder.append_sources(code_gen_dir + "/*.cpp")
+        builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp")
+        builder.append_includes("-lz")
+        builder.set_executable_path(code_gen_dir + "/node_model")
+        builder.build(code_gen_dir)
+        self.set_nodeattr("executable_path", builder.executable_path)
+
+    def dynamic_input_to_npy(self, context, count, target_dir=""):
+        """Saves input (given context) into .npy files.
+
+        Count indicates the number of inputs that have to be saved."""
+        node = self.onnx_node
+        if target_dir == "":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+            if code_gen_dir == "":
+                raise Exception(
+                    """
+Found no codegen dir for this node, did you run the prepare_cppsim transformation?
+                    """
+                )
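+            # fall back to the cppsim codegen dir when no target_dir is given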
+ """ + ) + target_dir = code_gen_dir + # create a npy file for each input of the node (in_ind is input index) + # assuming dynamic inputs start from 0 + for in_ind in range(count): + current_input_name = node.input[in_ind] + input_array = context[current_input_name] + if in_ind == 0: + expected_inp_shape = self.get_folded_input_shape() + idt = self.get_input_datatype() + else: + expected_inp_shape = self.get_folded_input_shape(in_ind) + idt = self.get_input_datatype(in_ind) + reshaped_input = input_array.reshape(expected_inp_shape) + if idt == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(target_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + + def npy_to_dynamic_output(self, context): + """Reads the output from an output.npy file generated from cppsim and + places its content into the context dictionary.""" + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + output = np.load("{}/output.npy".format(code_gen_dir)) + exp_shape = self.get_normal_output_shape() + context[node.output[0]] = output.reshape(exp_shape) + + def npy_to_dynamic_outputs(self, context, npy_list): + """Reads the output from .npy files generated from cppsim and places + their content into the context dictionary. + npy_list is a list specifying which files to read, and its order must + match the order of node outputs.""" + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + for i in range(len(npy_list)): + output = np.load("{}/{}".format(code_gen_dir, npy_list[i])) + if i == 0: + exp_shape = self.get_normal_output_shape() + else: + exp_shape = self.get_normal_output_shape(i) + context[node.output[i]] = output.reshape(exp_shape) + + def exec_precompiled_singlenode_model(self): + """Executes precompiled executable.""" + executable_path = self.get_nodeattr("executable_path") + if executable_path == "": + raise Exception( + """ +Found no executable for this node, did you run the codegen and +compilation transformations? + """ + ) + process_execute = subprocess.Popen(executable_path, stdout=subprocess.PIPE) + process_execute.communicate() + + def hls_sname(self): + """Get the naming convention used by Vitis HLS for stream signals + Example: the TDATA for a stream called "out" would be out_V_TDATA. + """ + return "V" + + def execute_node(self, context, graph): + """Executes single node using cppsim or rtlsim.""" + mode = self.get_nodeattr("exec_mode") + if mode == "cppsim": + # save input(s) + self.dynamic_input_to_npy(context, 1) + # execute the precompiled model + self.exec_precompiled_singlenode_model() + # load output npy file + self.npy_to_dynamic_output(context) + elif mode == "rtlsim": + pass + + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + @abstractmethod + def global_includes(self): + """Function to set the global includes for c++ code that has to be generated + for cppsim or rtlsim, is member function of HLSBackend class but has to + be filled by every node.""" + pass + + @abstractmethod + def defines(self, var): + """Function to set the define commands for c++ code that has to be generated + for cppsim or rtlsim, is member function of HLSBackend class but has to + be filled by every node. 
+    @abstractmethod
+    def defines(self, var):
+        """Function to set the define commands for c++ code that has to be generated
+        for cppsim or rtlsim, is member function of HLSBackend class but has to
+        be filled by every node.
+
+        var: makes it possible to reuse the function for different c++ code generation.
+        I.e. if set to "ipgen" in MatrixVectorActivation additional PRAGMA defines are
+        added."""
+        pass
+
+    def read_npy_data(self):
+        """Function to generate the commands for reading data from .npy file in c++,
+        might need to be overwritten depending on custom op."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_input_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_instream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_in = "%s/input_0.npy" % code_gen_dir
+        self.code_gen_dict["$READNPYDATA$"] = []
+        self.code_gen_dict["$READNPYDATA$"].append(
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                npy_in,
+                self.hls_sname(),
+            )
+        )
+
+    def strm_decl(self):
+        """Function to generate the commands for the stream declaration in c++,
+        is member function of HLSBackend class but might need to be filled
+        by node."""
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+                self.get_instream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+
+    @abstractmethod
+    def docompute(self):
+        """Function to generate the commands for the computational part of the
+        c++ code, is member function of HLSBackend class but has to be filled
+        by every node."""
+        pass
+
+    def dataoutstrm(self):
+        """Function to generate the commands for reading out data from c++ and convert
+        into npy format, is member function of HLSBackend class but might need to be
+        filled by node."""
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        if dtype == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            dtype = DataType["BINARY"]
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                self.hls_sname(),
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        """Function to generate the commands for saving data in .npy file in c++"""
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    @abstractmethod
+    def blackboxfunction(self):
+        """Function to generate a blackbox function in c++ from which an IP block
+        will be generated, is member function of HLSBackend class but has to be filled
+        by every node."""
+        pass
+
+    def pragmas(self):
+        """Function to generate the pragma commands in c++,
+        might need to be overwritten depending on custom op."""
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS
INTERFACE ap_ctrl_none port=return") + + def get_ap_int_max_w(self): + """Return the maximum width of any ap_int used in this module. Used to set the + AP_INT_MAX_W definition for HLS.""" + instream = self.get_instream_width() + outstream = self.get_outstream_width() + ret = max([instream, outstream]) + assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret + return ret diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py similarity index 52% rename from src/finn/custom_op/fpgadataflow/hlscustomop.py rename to src/finn/custom_op/fpgadataflow/hwcustomop.py index 4fed8ed4b5..57c0fec067 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,24 +28,13 @@ import numpy as np import os -import subprocess import warnings from abc import abstractmethod from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io -from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple -from finn.util.basic import ( - CppBuilder, - get_rtlsim_trace_depth, - make_build_dir, - pyverilate_get_liveness_threshold_cycles, -) -from finn.util.hls import CallHLS -from finn.util.pyverilator import make_single_source_file - -from . import templates +from finn.util.basic import pyverilate_get_liveness_threshold_cycles try: from pyverilator import PyVerilator @@ -53,34 +42,21 @@ PyVerilator = None -class HLSCustomOp(CustomOp): - """HLSCustomOp class all custom ops that correspond to a finn-hlslib - function are based on. Contains different functions every fpgadataflow +class HWCustomOp(CustomOp): + """HWCustomOp class all custom ops that can be implemented with either + HLS or RTL backend are based on. Contains different functions every fpgadataflow custom node should have. Some as abstract methods, these have to be filled when writing a new fpgadataflow custom op node.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - self.code_gen_dict = {} - # getting templates from templates.py - - # template for single node execution - self.docompute_template = templates.docompute_template - - # templates for single node ip generation - # cpp file - self.ipgen_template = templates.ipgen_template - # tcl script - self.ipgentcl_template = templates.ipgentcl_template - def get_nodeattr_types(self): return { "backend": ("s", True, "fpgadataflow"), - "code_gen_dir_cppsim": ("s", False, ""), + "preferred_impl_style": ("s", False, "", {"", "hls", "rtl"}), "code_gen_dir_ipgen": ("s", False, ""), - "executable_path": ("s", False, ""), "ipgen_path": ("s", False, ""), "ip_path": ("s", False, ""), "ip_vlnv": ("s", False, ""), @@ -89,7 +65,6 @@ def get_nodeattr_types(self): "cycles_estimate": ("i", False, 0), "rtlsim_trace": ("s", False, ""), "res_estimate": ("s", False, ""), - "res_hls": ("s", False, ""), "res_synth": ("s", False, ""), "rtlsim_so": ("s", False, ""), # partitioning info @@ -151,68 +126,6 @@ def get_verilog_top_module_intf_names(self): intf_names["ap_none"] = [] return intf_names - def get_verilog_top_filename(self): - "Return the Verilog top module filename for this node." 
- - verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format( - self.get_nodeattr("code_gen_dir_ipgen"), - self.onnx_node.name, - self.get_verilog_top_module_name(), - ) - return verilog_file - - def get_all_verilog_paths(self): - "Return list of all folders containing Verilog code for this node." - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - assert ( - code_gen_dir != "" - ), """Node attribute "code_gen_dir_ipgen" is - not set. Please run HLSSynthIP first.""" - verilog_path = "{}/project_{}/sol1/impl/verilog/".format(code_gen_dir, self.onnx_node.name) - # default impl only returns the HLS verilog codegen dir - return [verilog_path] - - def get_all_verilog_filenames(self, abspath=False): - "Return list of all Verilog files used for this node." - - verilog_files = [] - verilog_paths = self.get_all_verilog_paths() - for verilog_path in verilog_paths: - for f in os.listdir(verilog_path): - if f.endswith(".v"): - if abspath: - verilog_files += [verilog_path + "/" + f] - else: - verilog_files += [f] - return verilog_files - - def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - verilog_files = self.get_all_verilog_filenames(abspath=True) - single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") - tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") - target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v" - make_single_source_file(verilog_files, target_file) - - # build the Verilator emu library - sim = PyVerilator.build( - self.get_verilog_top_module_name() + ".v", - build_dir=tmp_build_dir, - verilog_path=[single_src_dir], - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim - def get_rtlsim(self): """Return a PyVerilator wrapper for the Verilator emulation library for this node.""" @@ -247,27 +160,27 @@ def uram_efficiency_estimation(self): def bram_estimation(self): """Function for BRAM resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def uram_estimation(self): """Function for UltraRAM resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def lut_estimation(self): """Function for LUT resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def dsp_estimation(self): """Function for DSP resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def get_exp_cycles(self): """Function for estimation of expected cycles for set folding, - is member function of HLSCustomOp class but has to be filled + is member function of HWCustomOp class but has to be filled by every node""" return 0 @@ -278,216 +191,6 @@ def get_op_and_param_counts(self): {op_ : , param_: }.""" return {} - def code_generation_ipgen(self, model, fpgapart, clk): - """Generates c++ code and tcl script for ip generation.""" 
- node = self.onnx_node - - # generate top cpp file for ip generation - path = self.get_nodeattr("code_gen_dir_ipgen") - self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] - self.generate_params(model, path) - self.global_includes() - self.defines("ipgen") - self.blackboxfunction() - self.pragmas() - self.docompute() - - template = self.ipgen_template - - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - f = open(os.path.join(code_gen_dir, "top_{}.cpp".format(node.name)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - # generate tcl script for ip generation - self.code_gen_dict["$PROJECTNAME$"] = ["project_{}".format(node.name)] - self.code_gen_dict["$HWSRCDIR$"] = [code_gen_dir] - self.code_gen_dict["$FPGAPART$"] = [fpgapart] - self.code_gen_dict["$TOPFXN$"] = [node.name] - self.code_gen_dict["$CLKPERIOD$"] = [str(clk)] - self.code_gen_dict["$DEFAULT_DIRECTIVES$"] = self.ipgen_default_directives() - self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives() - - template = self.ipgentcl_template - - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - f = open(os.path.join(code_gen_dir, "hls_syn_{}.tcl".format(node.name)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - def ipgen_default_directives(self): - """Return list of default HLS synthesis directives""" - - default_directives = [ - "set_param hls.enable_hidden_option_error false", - "config_compile -disable_unroll_code_size_check -pipeline_style flp", - "config_interface -m_axi_addr64", - "config_rtl -module_auto_prefix", - "config_rtl -deadlock_detection none", - ] - return default_directives - - def ipgen_extra_directives(self): - "Return a list of extra tcl directives for HLS synthesis." - return [] - - def ipgen_singlenode_code(self): - """Builds the bash script for IP generation using the CallHLS utility.""" - node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - builder = CallHLS() - builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name)) - builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name)) - builder.build(code_gen_dir) - ipgen_path = builder.ipgen_path - assert os.path.isdir(ipgen_path), "IPGen failed: %s not found" % (ipgen_path) - self.set_nodeattr("ipgen_path", ipgen_path) - ip_path = ipgen_path + "/sol1/impl/ip" - assert os.path.isdir(ip_path), "IPGen failed: %s not found. 
Check log under %s" % ( - ip_path, - code_gen_dir, - ) - self.set_nodeattr("ip_path", ip_path) - vlnv = "xilinx.com:hls:%s:1.0" % node.name - self.set_nodeattr("ip_vlnv", vlnv) - - def code_generation_cppsim(self, model): - """Generates c++ code for simulation (cppsim).""" - node = self.onnx_node - path = self.get_nodeattr("code_gen_dir_cppsim") - self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] - self.generate_params(model, path) - self.global_includes() - self.defines("cppsim") - self.read_npy_data() - self.strm_decl() - self.pragmas() - self.docompute() - self.dataoutstrm() - self.save_as_npy() - - template = self.docompute_template - - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - def code_generation_ipi(self): - """Constructs and returns the TCL for node instantiation in Vivado IPI.""" - vlnv = self.get_nodeattr("ip_vlnv") - cmd = ["create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)] - return cmd - - def compile_singlenode_code(self): - """Builds the bash script for compilation using the CppBuilder from - finn.util.basic and executes the script to produce the executable.""" - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - builder = CppBuilder() - # to enable additional debug features please uncommand the next line - # builder.append_includes("-DDEBUG") - builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp") - builder.append_includes("-I$FINN_ROOT/deps/cnpy/") - builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib") - builder.append_includes("-I$FINN_ROOT/custom_hls") - builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"])) - builder.append_includes("--std=c++14") - builder.append_includes("-O3") - builder.append_sources(code_gen_dir + "/*.cpp") - builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp") - builder.append_includes("-lz") - builder.set_executable_path(code_gen_dir + "/node_model") - builder.build(code_gen_dir) - self.set_nodeattr("executable_path", builder.executable_path) - - def dynamic_input_to_npy(self, context, count, target_dir=""): - """Saves input (given context) into .npy files. - - Count indicates the number of inputs that have to be saved.""" - node = self.onnx_node - if target_dir == "": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - if code_gen_dir == "": - raise Exception( - """ - Found no codegen dir for this node, did you run the prepare_cppsim transformation? 
- """ - ) - target_dir = code_gen_dir - # create a npy file for each input of the node (in_ind is input index) - # assuming dynamic inputs start from 0 - for in_ind in range(count): - current_input_name = node.input[in_ind] - input_array = context[current_input_name] - if in_ind == 0: - expected_inp_shape = self.get_folded_input_shape() - idt = self.get_input_datatype() - else: - expected_inp_shape = self.get_folded_input_shape(in_ind) - idt = self.get_input_datatype(in_ind) - reshaped_input = input_array.reshape(expected_inp_shape) - if idt == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(target_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - - def npy_to_dynamic_output(self, context): - """Reads the output from an output.npy file generated from cppsim and - places its content into the context dictionary.""" - node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - output = np.load("{}/output.npy".format(code_gen_dir)) - exp_shape = self.get_normal_output_shape() - context[node.output[0]] = output.reshape(exp_shape) - - def npy_to_dynamic_outputs(self, context, npy_list): - """Reads the output from .npy files generated from cppsim and places - their content into the context dictionary. - npy_list is a list specifying which files to read, and its order must - match the order of node outputs.""" - node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - for i in range(len(npy_list)): - output = np.load("{}/{}".format(code_gen_dir, npy_list[i])) - if i == 0: - exp_shape = self.get_normal_output_shape() - else: - exp_shape = self.get_normal_output_shape(i) - context[node.output[i]] = output.reshape(exp_shape) - - def exec_precompiled_singlenode_model(self): - """Executes precompiled executable.""" - executable_path = self.get_nodeattr("executable_path") - if executable_path == "": - raise Exception( - """ -Found no executable for this node, did you run the codegen and -compilation transformations? - """ - ) - process_execute = subprocess.Popen(executable_path, stdout=subprocess.PIPE) - process_execute.communicate() - def reset_rtlsim(self, sim): """Sets reset input in pyverilator to zero, toggles the clock and set it back to one""" @@ -501,12 +204,6 @@ def toggle_clk(self, sim): sim.io.ap_clk = 1 sim.io.ap_clk = 0 - def hls_sname(self): - """Get the naming convention used by Vitis HLS for stream signals - Example: the TDATA for a stream called "out" would be out_V_TDATA. - """ - return "V" - def rtlsim(self, sim, inp, inp2=None): """Runs the pyverilator simulation by passing the input values to the simulation, toggle the clock and observing the execution time. Function contains also an @@ -607,104 +304,19 @@ def rtlsim_multi_io(self, sim, io_dict): ) self.set_nodeattr("cycles_rtlsim", total_cycle_count) - def execute_node(self, context, graph): - """Executes single node using cppsim or rtlsim.""" - mode = self.get_nodeattr("exec_mode") - if mode == "cppsim": - # save input(s) - self.dynamic_input_to_npy(context, 1) - # execute the precompiled model - self.exec_precompiled_singlenode_model() - # load output npy file - self.npy_to_dynamic_output(context) - elif mode == "rtlsim": - pass - - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - def generate_params(self, model, path): """Function to generate parameters (i.e. weights and thresholds), - is member function of HLSCustomOp class but has to be filled - by every node.""" + is member function of HWCustomOp class but has to be filled + by every node that needs to generate parameters.""" pass @abstractmethod def get_number_output_values(self): """Function to get the number of expected output values, - is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def global_includes(self): - """Function to set the global includes for c++ code that has to be generated - for cppsim or rtlsim, is member function of HLSCustomOp class but has to - be filled by every node.""" - pass - - @abstractmethod - def defines(self, var): - """Function to set the define commands for c++ code that has to be generated - for cppsim or rtlsim, is member function of HLSCustomOp class but has to - be filled by every node. - - var: makes it possible to reuse the function for different c++ code generation. - I.e. if set to "ipgen" in MatrixVectorActivation additional PRAGMA defines are - added.""" - pass - - @abstractmethod - def read_npy_data(self): - """Function to generate the commands for reading data from .npy file in c++, - is member function of HLSCustomOp class but has to be filled by every node.""" - pass - - @abstractmethod - def strm_decl(self): - """Function to generate the commands for the stream declaration in c++, - is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def docompute(self): - """Function to generate the commands for the computational part of the - c++ code, is member function of HLSCustomOp class but has to be filled + is member function of HWCustomOp class but has to be filled by every node.""" pass - @abstractmethod - def dataoutstrm(self): - """Function to generate the commands for reading out data from c++ and convert - into npy format, is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def save_as_npy(self): - """Function to generate the commands for saving data in .npy file in c++, - is member function of HLSCustomOp class but has to be filled by every node.""" - pass - - @abstractmethod - def blackboxfunction(self): - """Function to generate a blackbock function in c++ from which an IP block - will be generated, is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def pragmas(self): - """Function to generate the pragma commands in c++, is member function of - HLSCustomOp class but has to be filled by every node.""" - pass - def get_input_datatype(self, ind=0): """Returns FINN DataType of input stream ind.""" raise Exception("get_input_datatype not implemented for this op") @@ -749,15 +361,6 @@ def get_outstream_width_padded(self, ind=0): out_width = self.get_outstream_width(ind=ind) return roundup_to_integer_multiple(out_width, 8) - def get_ap_int_max_w(self): - """Return the maximum width of any ap_int used in this module. 
Used to set the - AP_INT_MAX_W definition for HLS.""" - instream = self.get_instream_width() - outstream = self.get_outstream_width() - ret = max([instream, outstream]) - assert ret <= 32768, "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret - return ret - def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): """Return the unconstrained characteristic functions for this node.""" # ensure rtlsim is ready diff --git a/src/finn/custom_op/fpgadataflow/labelselect.py b/src/finn/custom_op/fpgadataflow/labelselect.py new file mode 100644 index 0000000000..f4b098cff7 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/labelselect.py @@ -0,0 +1,186 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
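+#
+# Illustrative walk-through (assumed values, not part of the upstream file) of
+# the default outputDataType derivation in the constructor below, for Labels=10:
+#   odt = DataType.get_smallest_possible(10 - 1)                # DataType["UINT4"]
+#   bw = roundup_to_integer_multiple(odt.bitwidth(), 8)         # 8
+#   DataType[odt.name.replace(str(odt.bitwidth()), str(bw))]    # DataType["UINT8"]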
+import numpy as np +import onnxruntime as rt +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.util.basic import qonnx_make_model, roundup_to_integer_multiple + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class LabelSelect(HWCustomOp): + """Abstraction layer for HW implementation of LabelSelect""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + odt_name = self.get_nodeattr("outputDataType") + if odt_name == "": + # If not provided compute min size + labels = self.get_nodeattr("Labels") + odt = DataType.get_smallest_possible(labels - 1) + # ensure a datatype divisible by 8-bits in case this is the last node + bw = roundup_to_integer_multiple(odt.bitwidth(), 8) + new_odt_name = odt.name.replace(str(odt.bitwidth()), str(bw)) + odt = DataType[new_odt_name] + odt_name = odt.name + self.set_nodeattr("outputDataType", odt_name) + + def get_nodeattr_types(self): + my_attrs = { + "Labels": ("i", True, 0), + "PE": ("i", True, 0), + "K": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + "outputDataType": ("s", False, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self, ind=0): + nlabels = self.get_nodeattr("Labels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [nlabels]) + return ishape + + def get_folded_input_shape(self, ind=0): + nlabels = self.get_nodeattr("Labels") + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + assert nlabels % pe == 0, "PE must divide Labels" + folds = int(nlabels / pe) + folded_ishape = tuple(vecs + [folds, pe]) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + k = self.get_nodeattr("K") + vecs = list(self.get_nodeattr("numInputVectors")) + oshape = tuple(vecs + [k]) + return oshape + + def get_folded_output_shape(self, ind=0): + k = self.get_nodeattr("K") + vecs = list(self.get_nodeattr("numInputVectors")) + oshape = tuple(vecs + [k, 1]) + return oshape + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." 
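+        # RandomNormal acts purely as a shape/dtype stand-in so shape inference
+        # sees the expected output; its random values are never used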
+        return helper.make_node(
+            "RandomNormal",
+            inputs=[],
+            outputs=[self.onnx_node.output[0]],
+            mean=0.0,
+            scale=1.0,
+            dtype=TensorProto.INT64,
+            shape=list(oshape),
+        )
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        # check input datatype against property
+        idt = model.get_tensor_datatype(node.input[0])
+        self.set_nodeattr("inputDataType", idt.name)
+
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(self.onnx_node.output[0], odt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        return ret
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        ret = DataType[self.get_nodeattr("outputDataType")]
+        return ret
+
+    def get_instream_width(self, ind=0):
+        """Returns input stream width."""
+        ibits = self.get_input_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        in_width = pe * ibits
+        return in_width
+
+    def get_outstream_width(self, ind=0):
+        """Returns output stream width."""
+        return self.get_output_datatype().bitwidth()
+
+    def get_number_output_values(self):
+        return self.get_nodeattr("K")
+
+    def execute_node(self, context, graph):
+        # create a standard TopK node to help calculate the result
+        node = self.onnx_node
+        k = self.get_nodeattr("K")
+
+        inp_values = context[node.input[0]]
+        oshape = context[node.output[0]].shape
+        ishape = inp_values.shape
+        inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape)
+        k_inp = helper.make_tensor_value_info("k_inp", TensorProto.INT64, [1])
+        outp = helper.make_tensor_value_info(node.output[0], TensorProto.INT64, oshape)
+        val_outp = helper.make_tensor_value_info("val_outp", TensorProto.FLOAT, oshape)
+        node_topk = helper.make_node(
+            "TopK",
+            inputs=[node.input[0], "k_inp"],
+            outputs=["val_outp", node.output[0]],
+        )
+        graph_topk = helper.make_graph(
+            nodes=[node_topk],
+            name="single-topk-exec",
+            inputs=[inp, k_inp],
+            outputs=[val_outp, outp],
+        )
+
+        opset_version = self.onnx_opset_version
+        opset_imports = [helper.make_opsetid("", opset_version)]
+        onnx_kwargs = {"opset_imports": opset_imports}
+        model_topk = qonnx_make_model(graph_topk, **onnx_kwargs)
+        idict = {node.input[0]: inp_values, "k_inp": [k]}
+        sess = rt.InferenceSession(model_topk.SerializeToString())
+        result = sess.run(None, idict)
+        context[node.output[0]] = np.asarray(result[1], dtype=np.float32).reshape(oshape)
+
+    def get_exp_cycles(self):
+        nlabels = self.get_nodeattr("Labels")
+        pe = self.get_nodeattr("PE")
+        exp_cycles = nlabels / pe
+        return int(exp_cycles)
diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py
index 2dfca90ed9..ab6228a5d6 100644
--- a/src/finn/custom_op/fpgadataflow/lookup.py
+++ b/src/finn/custom_op/fpgadataflow/lookup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -27,22 +27,19 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
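The HW abstraction layers above and below now execute functionally by wrapping a single standard ONNX op in a one-node model and running it with onnxruntime (TopK for LabelSelect, Gather for Lookup). A minimal self-contained sketch of that pattern, with assumed shapes and k=1, not taken from the diff:

```python
import numpy as np
import onnxruntime as rt
from onnx import TensorProto, helper
from qonnx.util.basic import qonnx_make_model

# one-node TopK model, mirroring what LabelSelect.execute_node builds internally
inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 10])
k_inp = helper.make_tensor_value_info("k_inp", TensorProto.INT64, [1])
vals = helper.make_tensor_value_info("vals", TensorProto.FLOAT, [1, 1])
inds = helper.make_tensor_value_info("inds", TensorProto.INT64, [1, 1])
topk = helper.make_node("TopK", inputs=["inp", "k_inp"], outputs=["vals", "inds"])
graph = helper.make_graph([topk], "single-topk-exec", [inp, k_inp], [vals, inds])
model = qonnx_make_model(graph, opset_imports=[helper.make_opsetid("", 13)])

# the indices output plays the role of the LabelSelect result
sess = rt.InferenceSession(model.SerializeToString())
x = np.random.rand(1, 10).astype(np.float32)
outs = sess.run(None, {"inp": x, "k_inp": np.array([1], dtype=np.int64)})
print(outs[1])  # index of the largest element, shape (1, 1)
```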
import numpy as np -import os +import onnxruntime as rt import warnings -from math import ceil, log2 +from math import ceil +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class Lookup(HLSCustomOp): - "Streaming elementwise HLS lookup, mapping indices to values." +class Lookup(HWCustomOp): + """Abstraction layer for HW implementation of streaming elementwise lookup, + mapping indices to values.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -60,9 +57,9 @@ def get_nodeattr_types(self): # Input shape "InputShape": ("ints", False, [1]), # Memory mode - # const : parameters baked into bitfile (BRAM) + # internal_embedded : parameters baked into bitfile (BRAM) # external : lookup performed in external memory over AXI MM - "mem_mode": ("s", False, "const", ["const", "external"]), + "mem_mode": ("s", False, "internal_embedded", ["internal_embedded", "external"]), # Width for AXI-MM interface # only relevant when mem_mode="external" "ext_mem_width": ("i", False, 32), @@ -93,7 +90,7 @@ def get_folded_output_shape(self, ind=0): ishape = self.get_normal_input_shape() mem_mode = self.get_nodeattr("mem_mode") emb_dim = self.get_nodeattr("EmbeddingDim") - if mem_mode == "const": + if mem_mode == "internal_embedded": oshape = list(ishape) + [emb_dim] elif mem_mode == "external": ext_mem_width = self.get_nodeattr("ext_mem_width") @@ -156,301 +153,43 @@ def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def global_includes(self): - mem_mode = self.get_nodeattr("mem_mode") - global_incls = [] - global_incls.append('#include "lookup.hpp"') - if mem_mode == "const": - global_incls.append('#include "embeddings.hpp"') - self.code_gen_dict["$GLOBALS$"] = global_incls - - def defines(self, var): - n_inputs = np.prod(self.get_folded_input_shape()[:-1]) - dtype = self.get_input_datatype() - elem_hls_type = dtype.get_hls_datatype_str() - emb_type = DataType[self.get_nodeattr("EmbeddingType")] - emb_hls_type = emb_type.get_hls_datatype_str() - emb_dim = self.get_nodeattr("EmbeddingDim") - mem_mode = self.get_nodeattr("mem_mode") - my_defines = [] - my_defines.append("#define NumInputs %d" % n_inputs) - if mem_mode == "external": - ext_mem_width = self.get_nodeattr("ext_mem_width") - ext_mem_emb_size = self.get_folded_output_shape()[-2] - ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) - my_defines.append("#define MemBits %d" % ext_mem_width) - my_defines.append("#define EmbeddingSize %d" % ext_mem_emb_size) - my_defines.append("#define EmbeddingAlign %d" % ext_mem_emb_align) - my_defines.append("#define T_SRC %s" % elem_hls_type) - my_defines.append("#define T_DST ap_uint") - elif mem_mode == "const": - my_defines.append("#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings")) - my_defines.append("#define EmbeddingDim %d" % emb_dim) - my_defines.append("#define InputType %s" % elem_hls_type) - my_defines.append("#define EmbeddingType %s" % emb_hls_type) - self.code_gen_dict["$DEFINES$"] = my_defines - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # 
use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "int64_t" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", %s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - "false", - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """StreamingLookup(in0_%s, out_%s, embeddings);""" - % (self.hls_sname(), self.hls_sname()) - ] - elif mem_mode == "external": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """StreamingLookup_ext(in0_%s, out_%s, mem, size, oob_count, - oob_irq);""" - % (self.hls_sname(), self.hls_sname()) - ] - - def blackboxfunction(self): - mem_mode = self.get_nodeattr("mem_mode") - ibits = self.get_instream_width() - packed_input_hls_type = "ap_uint<%d>" % ibits - obits = self.get_outstream_width() - packed_output_hls_type = "ap_uint<%d>" % obits - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - packed_input_hls_type, - self.hls_sname(), - packed_output_hls_type, - self.hls_sname(), - ) - ] - elif mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void " - + self.onnx_node.name - + "(hls::stream &in0_%s, hls::stream &out_%s, " - % (self.hls_sname(), self.hls_sname()) - + "T_DST const *const mem, unsigned const size, " - + "unsigned &oob_count, bool &oob_irq)" - ] - - def pragmas(self): - mem_mode = self.get_nodeattr("mem_mode") - my_pragmas = ["#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()] - my_pragmas.append("#pragma HLS INTERFACE axis port=out_" + self.hls_sname()) - my_pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return") - if mem_mode == "const": - my_pragmas.append("#pragma HLS BIND_STORAGE variable=embeddings type=ROM_2P impl=BRAM") - elif mem_mode == "external": - my_pragmas.append("#pragma HLS INTERFACE m_axi 
offset=slave port=mem") - my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control") - my_pragmas.append("#pragma HLS INTERFACE s_axilite port=size bundle=control") - my_pragmas.append("#pragma HLS INTERFACE s_axilite port=oob_count bundle=control") - my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq") - else: - raise Exception("Unrecognized mem_mode: " + mem_mode) - self.code_gen_dict["$PRAGMAS$"] = my_pragmas - - def generate_params(self, model, path): - mem_mode = self.get_nodeattr("mem_mode") - embeddings = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "const": - code_gen_dir = path - weight_filename = "{}/embeddings.hpp".format(code_gen_dir) - edt = DataType[self.get_nodeattr("EmbeddingType")] - # obits = self.get_outstream_width() - # packed_output_hls_type = "ap_uint<%d>" % obits - assert np.vectorize(edt.allowed)( - embeddings - ).all(), "Embeddings can't be expressed with type %s" % str(edt) - # reverse innertmost dim in embeddings to remain compatible with - # how we normally encode the data in FINN - embeddings_rev = np.flip(embeddings, -1) - embeddings_hls_code = numpy_to_hls_code(embeddings_rev, edt, "embeddings", True, False) - f_thresh = open(weight_filename, "w") - f_thresh.write(embeddings_hls_code) - f_thresh.close() - elif mem_mode == "external": - edt = DataType[self.get_nodeattr("EmbeddingType")] - ext_mem_width = self.get_nodeattr("ext_mem_width") - assert edt.bitwidth() == 8, ( - "Lookup with mem_mode=external " - + "only works with 8-bit embeddings but found " - + str(edt) - ) - emb_dim = self.get_nodeattr("EmbeddingDim") - # need to zero-pad embeddings in external mode for burst alignment - # compute how much padding we need - emb_elems_per_ext_mem_width = self.get_folded_output_shape()[-1] - ext_mem_emb_size = self.get_folded_output_shape()[-2] - ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) - align_factor = int((ext_mem_width / 8) * 2**ext_mem_emb_align) - pad_amount = align_factor - emb_dim - embeddings_padded = np.pad(embeddings, [(0, 0), (0, pad_amount)]) - # reshape for packing the innermost dim - embeddings_padded = embeddings_padded.reshape(-1, emb_elems_per_ext_mem_width) - weight_filename = "%s/%s.dat" % (path, self.onnx_node.name) - ret = pack_innermost_dim_as_hex_string( - embeddings_padded, edt, ext_mem_width, True, prefix="" - ) - with open(weight_filename, "w") as f: - for current_line in ret: - f.write(current_line + "\n") - else: - raise Exception("Unrecognized mem_mode: " + mem_mode) - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # create a standard add node to help calculate the result node = self.onnx_node - exp_ishape = tuple(self.get_normal_input_shape()) - exp_oshape = tuple(self.get_normal_output_shape()) - folded_ishape = tuple(self.get_folded_input_shape()) - folded_oshape = tuple(self.get_folded_output_shape()) - mem_mode = self.get_nodeattr("mem_mode") - assert ( - mem_mode == "const" - ), "Only mem_mode=const is supported for simulation of Lookup layer" - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert inp.dtype == np.int64, "Inputs must be contained in int64 ndarray" - assert inp.shape == exp_ishape, """Input shape doesn't match expected shape.""" - export_idt = self.get_input_datatype() - odt = self.get_output_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + inp_values = context[node.input[0]] + ishape = inp_values.shape + data_values = context[node.input[1]] + dshape = data_values.shape + oshape = context[node.output[0]].shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.INT64, ishape) + data = helper.make_tensor_value_info(node.input[1], TensorProto.FLOAT, dshape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + node_gather = helper.make_node( + "Gather", + inputs=[node.input[1], node.input[0]], + outputs=[node.output[0]], + ) + graph_gather = helper.make_graph( + nodes=[node_gather], + name="single-gather-exec", + inputs=[data, inp], + outputs=[outp], + ) - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, - out_npy_path, - odt, - out_shape, - packed_bits, - target_bits, - reverse_inner=True, - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape.""" + opset_version = 13 + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_gather = qonnx_make_model(graph_gather, **onnx_kwargs) + idict = {node.input[0]: inp_values, node.input[1]: data_values} + sess = rt.InferenceSession(model_gather.SerializeToString()) + result = sess.run(None, idict) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) def bram_estimation(self): mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "internal_embedded": # current calculation assumes embeddings always stored in BRAM_18Ks - # when mem_mode is const + # when mem_mode is internal_embedded width_factor = ceil(self.get_outstream_width() / 16) depth_factor = ceil(self.get_nodeattr("NumEmbeddings") / 1024) return width_factor * depth_factor @@ -466,15 +205,6 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 18 * 1024 return ebits / bram16_est_capacity - def get_ap_int_max_w(self): - parent_max = super().get_ap_int_max_w() - mem_mode = self.get_nodeattr("mem_mode") - ext_mem_width = self.get_nodeattr("ext_mem_width") - if mem_mode == "external": - return max(ext_mem_width, parent_max) - else: - return parent_max - def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 6699340cac..7bbe4c04e9 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,23 +28,20 @@ import math import numpy as np -import os +import onnx.numpy_helper as np_helper +import qonnx.custom_op.general.xnorpopcount as xp import textwrap import warnings from qonnx.core.datatype import DataType +from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -54,9 +51,8 @@ # the ... 
here can be any shape (representing groups of vectors) -class MatrixVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch - function.""" +class MVAU(HWCustomOp): + """Abstraction layer for HW implementation of MatrixVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -67,7 +63,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), + "resType": ("s", False, "auto", {"auto", "lut", "dsp"}), "ActVal": ("i", False, 0), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), @@ -86,11 +82,16 @@ def get_nodeattr_types(self): # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) "numInputVectors": ("ints", False, [1]), # memory mode for the FC weights - # const -- embedded weights, default, long compile/synth times - # decoupled -- streaming weights with weight streamer packaged inside IP + # internal_embedded -- embedded weights, long compile/synth times + # internal_decoupled -- default, streaming weights with streamer packaged inside IP # external -- streaming weights with external streamer - "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), - # FPGA resource type for memories in decoupled mode + "mem_mode": ( + "s", + False, + "internal_decoupled", + {"internal_embedded", "internal_decoupled", "external"}, + ), + # FPGA resource type for memories in internal_decoupled mode # auto -- let Vivado decide # block -- use BRAM # distributed -- use LUTRAM @@ -112,8 +113,8 @@ def get_nodeattr_types(self): "auto", {"auto", "block", "distributed"}, ), - # (mem_mode = decoupled only) whether weights will be writable through - # an AXI-lite interface during runtime + # (mem_mode = internal_decoupled only) whether weights will be + # writeable through an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. # see finn-rtllib/memstream/doc/README for more about the memory # address map used for writable weights @@ -126,44 +127,40 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def calc_wmem(self): - """Calculates and returns WMEM.""" - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - assert mh % pe == 0, "Requirement MH divisable by PE is violated." - assert mw % simd == 0, "Requirement MW divisable by SIMD is violated." 
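The bipolar branches in the new `execute_node` above lean on a standard identity: if bipolar {-1, +1} values are encoded as binary {0, 1} via `x -> (x + 1) / 2`, then the XNOR-popcount (the number of bit positions where the operands agree) determines the bipolar dot product. A minimal numpy sketch of that equivalence, independent of the `qonnx` helper used in the diff (shapes and values are illustrative):

```python
import numpy as np

def xnor_popcount_matmul(a_bin, b_bin):
    # count of positions where the operands agree (XNOR == 1),
    # accumulated along the reduction dimension like a matmul
    return np.matmul(a_bin, b_bin) + np.matmul(1 - a_bin, 1 - b_bin)

rng = np.random.default_rng(42)
a = rng.choice([-1.0, 1.0], size=(3, 8))   # bipolar activations
w = rng.choice([-1.0, 1.0], size=(8, 4))   # bipolar weights

# encode bipolar as binary, same mapping as (x + 1) / 2 in execute_node
a_bin, w_bin = (a + 1) / 2, (w + 1) / 2

k = a.shape[-1]  # reduction length
popcount = xnor_popcount_matmul(a_bin, w_bin)
# matches minus mismatches: dot(a, w) = popcount - (k - popcount)
assert np.allclose(np.matmul(a, w), 2 * popcount - k)
```

This identity is what lets the hardware replace signed multipliers with pure XNOR-plus-popcount logic for bipolar networks; the thresholds then absorb the affine correction.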
- wmem = mw * mh // (pe * simd) - return wmem - - def calc_tmem(self): - """Calculates and returns TMEM.""" - if self.get_nodeattr("noActivation") == 1: - return 0 - else: - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - return mh // pe - - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): + def execute_node(self, context, graph): node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) + in_act = context[node.input[0]] + mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] + mvau_w = np_helper.to_array(mvau_w_init) + # Matrix multiplication + if self.get_nodeattr("binaryXnorMode"): + # Note: activation/weights are expected to be binary + # (by design coming from the transformation inferring this operation mode) + result = xp.xnorpopcountmatmul(in_act, mvau_w) + elif ( + self.get_nodeattr("inputDataType") == "BIPOLAR" + and self.get_nodeattr("weightDataType") == "BIPOLAR" + ): + # Convert to binary and use xnorpopcountmatmul function + result = xp.xnorpopcountmatmul((in_act + 1) / 2, (mvau_w + 1) / 2) + else: + # Regular matrix multiplication + result = np.matmul(in_act, mvau_w) + if self.get_nodeattr("noActivation") == 0: + mvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] + mvau_thr = np_helper.to_array(mvau_thr_init) + odt_is_bipolar = self.get_nodeattr("outputDataType") == "BIPOLAR" + out_scale = 2 if odt_is_bipolar else 1 + out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") + if result.ndim == 4: + # NHWC to NCHW for multithreshold node + result = result.transpose((0, 3, 1, 2)) + result = multithreshold(result, mvau_thr, out_scale, out_bias) + if result.ndim == 4: + # NCHW to NHWC + result = result.transpose((0, 2, 3, 1)) + + context[node.output[0]] = result def verify_node(self): info_messages = [] @@ -218,184 +215,26 @@ def verify_node(self): no_act ) ) - return info_messages - def uram_estimation(self): - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - omega = (D_in * D_out) / (Q * P) - mem_width = Q * W * P - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const" and self.calc_wmem() <= 128) - or (mmode == "external") - ): - return 0 - width_multiplier = math.ceil(mem_width / 72) - depth_multiplier = math.ceil(omega / 4096) - return width_multiplier * depth_multiplier - - def bram_estimation(self): - """Calculates resource estimation for BRAM based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. 
Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - omega = (D_in * D_out) / (Q * P) - mem_width = Q * W * P - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mmode == "const" and self.calc_wmem() <= 128) - or (mmode == "external") - ): - return 0 - # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # assuming decoupled (RTL) memory, which is more efficient than const (HLS) - if mem_width == 1: - return math.ceil(omega / 16384) - elif mem_width == 2: - return math.ceil(omega / 8192) - elif mem_width <= 4: - return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) - elif mem_width <= 9: - return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 9)) - elif mem_width <= 18 or omega > 512: - return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 18)) - else: - return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) - - def bram_efficiency_estimation(self): - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - bram16_est = self.bram_estimation() - if bram16_est == 0: - return 1 - wbits = W * D_in * D_out - bram16_est_capacity = bram16_est * 36 * 512 - return wbits / bram16_est_capacity - - def uram_efficiency_estimation(self): - """Function for URAM efficiency estimation: actual parameter storage - needed divided by the allocated URAM storage (from estimation)""" - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - uram_est = self.uram_estimation() - if uram_est == 0: - return 1 - wbits = W * D_in * D_out - uram_est_capacity = uram_est * 72 * 4096 - return wbits / uram_est_capacity - - def lut_estimation(self): - """Calculates resource estimations for LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # adder tree - addertree_luts = (W + A) * (2 * Q - 1) - # accumulator - acc_datatype = self.get_accumulator_datatype() - # if accDataType is not set, then it will default to INT32, which would - # be a large overestimate in most (if not all) cases. 
In this scenario, - # we would use the minimum accumulator as determined by the data types - # bound, derived in https://arxiv.org/abs/2301.13376 - alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed()) - acc_bits = min( - acc_datatype.bitwidth(), - np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), - ) - acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - tmem_style = self.get_nodeattr("ram_style_thresholds") - if (noact == 0) and (tmem_style == "distributed"): - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) - comp_luts = (2**B - 1) * acc_bits - - return int( - c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 - ) - - def dsp_estimation(self): - # multiplication - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) - def get_exp_cycles(self): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - num_inp_vec = self.get_nodeattr("numInputVectors") - mh = self.get_nodeattr("MH") - mw = self.get_nodeattr("MW") - # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv - return int(exp_cycles) + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" @@ -408,14 +247,14 @@ def get_input_datatype(self, ind=0): else: raise Exception("Undefined input ind for this layer type") - def get_accumulator_datatype(self): - """Returns FINN DataType of accumulator""" - return DataType[self.get_nodeattr("accDataType")] - def get_weight_datatype(self): """Returns FINN DataType of weights.""" return DataType[self.get_nodeattr("weightDataType")] + def get_accumulator_datatype(self): + """Returns FINN DataType of accumulator""" + return DataType[self.get_nodeattr("accDataType")] + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] @@ -431,9 +270,10 @@ def get_outstream_width(self, ind=0): return out_width def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" + """Returns weight stream width. + Used only in internal_decoupled and external mode.""" if ( - self.get_nodeattr("mem_mode") == "decoupled" + self.get_nodeattr("mem_mode") == "internal_decoupled" or self.get_nodeattr("mem_mode") == "external" ): pe = self.get_nodeattr("PE") @@ -446,21 +286,10 @@ def get_weightstream_width(self): def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. 
This is required
-        by the AXI Stream spec. Used in decoupled mode."""
+        by the AXI Stream spec. Used in internal_decoupled mode."""
         weight_width = self.get_weightstream_width()
         return roundup_to_integer_multiple(weight_width, 8)

-    def get_ap_int_max_w(self):
-        # base class impl (max of inp/out stream widths)
-        max_of_io = super().get_ap_int_max_w()
-        # decoupled mode weight stream
-        weightstream = self.get_weightstream_width()
-        # single PE weight entry
-        weight_bits = self.get_weight_datatype().bitwidth()
-        simd = self.get_nodeattr("SIMD")
-        single_pe_w = simd * weight_bits
-        return max([weightstream, max_of_io, single_pe_w])
-
     def get_folded_input_shape(self, ind=0):
         mw = self.get_nodeattr("MW")
         mh = self.get_nodeattr("MH")
@@ -505,81 +334,124 @@ def get_number_output_values(self):
         nf = np.prod(self.get_folded_output_shape()[:-1])
         return nf

-    def get_template_param_values(self):
-        """Returns the template parameter values according to input, output and weight
-        data types."""
-        ret = dict()
-        inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
-        out_hls_str = self.get_output_datatype().get_hls_datatype_str()
-        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
-        # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
-        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
-        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
-        if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
-            raise Exception("True binary (non-bipolar) inputs not yet supported")
-        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
-        # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
-        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
-        # reinterpret inp/wt as bipolar if bin_xnor_mode is iset
-        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
-        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
-        # fill in TSrcI and TWeightI
-        # TODO check these with Giulio
-        # TODO handle non-bipolar binary inputs
-        if inp_is_bipolar and wt_is_bipolar:
-            ret["TSrcI"] = "Recast<XnorMul>"
-            ret["TWeightI"] = "Identity"
-        elif (not inp_is_bipolar) and wt_is_bipolar:
-            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
-            ret["TWeightI"] = "Recast<Binary>"
-        elif inp_is_bipolar and (not wt_is_bipolar):
-            ret["TSrcI"] = "Recast<Binary>"
-            ret["TWeightI"] = "Identity"
-        elif (not inp_is_bipolar) and (not wt_is_bipolar):
-            ret["TSrcI"] = "Slice<%s>" % inp_hls_str
-            ret["TWeightI"] = "Identity"
+    def calc_wmem(self):
+        """Calculates and returns WMEM."""
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        assert mh % pe == 0, "Requirement MH divisible by PE is violated."
+        assert mw % simd == 0, "Requirement MW divisible by SIMD is violated."
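For intuition, here is a worked example of the folding arithmetic that `calc_wmem` and the weight-stream width helpers above implement. All parameter values are made up for illustration:

```python
import math

# illustrative folding parameters, not taken from any particular network
MW, MH = 8, 4    # input / output features of the weight matrix
SIMD, PE = 2, 2  # input / output parallelism
wbits = 3        # weight precision in bits

assert MH % PE == 0 and MW % SIMD == 0

# weight memory depth: one entry per (PE x SIMD) tile of the matrix
wmem = MW * MH // (PE * SIMD)            # 8 * 4 / 4 = 8

# weight stream width in internal_decoupled/external mode ...
weightstream_width = PE * SIMD * wbits   # 12 bits
# ... rounded up to the next multiple of 8, as the AXI Stream spec requires
weightstream_width_padded = math.ceil(weightstream_width / 8) * 8  # 16 bits

print(wmem, weightstream_width, weightstream_width_padded)
```

Each cycle, the streamer delivers one PE x SIMD tile, so the stream must be drained `wmem` times per matrix pass.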
+ wmem = mw * mh // (pe * simd) + return wmem - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str + def calc_tmem(self): + """Calculates and returns TMEM.""" + if self.get_nodeattr("noActivation") == 1: + return 0 + else: + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + return mh // pe - return ret + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + omega = (D_in * D_out) / (Q * P) + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "internal_decoupled" and mstyle != "ultra") + or (mmode == "internal_embedded" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 and MW % SIMD == 0 - * for bipolar {-1,+1} weights, convert to binary {0, 1} - * interleave rows between PEs - * reshape into (1, PE, WMEM, SIMD) and return + def bram_estimation(self): + """Calculates resource estimation for BRAM based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. Sep 2018 """ - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + omega = (D_in * D_out) / (Q * P) + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "internal_decoupled" and mstyle in ["distributed", "ultra"]) + or (mmode == "internal_embedded" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # assuming internal_decoupled (RTL) memory, + # which is more efficient than internal_embedded (HLS) + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 9)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 18)) + else: + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) + + def bram_efficiency_estimation(self): + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 + wbits = W * D_in * D_out + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + + def uram_efficiency_estimation(self): + """Function for URAM efficiency estimation: actual parameter storage + needed divided by the allocated URAM storage (from estimation)""" + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + 
uram_est = self.uram_estimation() + if uram_est == 0: + return 1 + wbits = W * D_in * D_out + uram_est_capacity = uram_est * 72 * 4096 + return wbits / uram_est_capacity + + def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - mw, - mh, - ), """Weights matrix doesn't - have expected shape (mw, mh)""" - assert mw % simd == 0, "Requirement MH divisable by SIMD is violated." - assert mh % pe == 0, "Requirement MH divisable by PE is violated." - # start by transposing the original weight matrix, since ONNX and - # finn-hlslib use different assumptions - # ONNX uses (in_features, out_features) and matmul(x, W) - # finn-hlslib uses (out_features, in_features) and matmul(W, x) - ret = orig_weight_matrix.T - if self.get_weight_datatype() == DataType["BIPOLAR"]: - # convert bipolar to binary - ret = (ret + 1) / 2 - # interleave rows between PEs and reshape - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - # create SIMD as innermost dimension and add a dummy outer dim - ret = ret.reshape(1, pe, wmem, simd) - # reverse the SIMD dimension - ret = np.flip(ret, axis=-1) - return ret + num_inp_vec = self.get_nodeattr("numInputVectors") + mh = self.get_nodeattr("MH") + mw = self.get_nodeattr("MW") + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv + return int(exp_cycles) def minimize_accumulator_width(self, model): """Minimize the accumulator bit width according to the weight values, @@ -611,7 +483,7 @@ def minimize_accumulator_width(self, model): # if the thresholds can be used to determine range, then adjust the range # according to the known values of the thresholds if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) # set threshold datatype (and accumulator datatype implicitly) min_threshold = thresholds.min() max_threshold = thresholds.max() @@ -620,7 +492,7 @@ def minimize_accumulator_width(self, model): warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) thresholds = np.clip(thresholds, acc_min, acc_max) model.set_initializer(self.onnx_node.input[2], thresholds) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) min_threshold = thresholds.min() max_threshold = thresholds.max() acc_min = min(min_threshold, acc_min) @@ -677,7 +549,7 @@ def minimize_weight_bit_width(self, model): self.set_nodeattr("weightDataType", wdt.name) return DataType[self.get_nodeattr("weightDataType")] - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + def get_hw_compatible_threshold_tensor(self, orig_thres_matrix): """Convert the original numpy weight matrix orig_weight_matrix into a form suitable for passing to the hlslib call: * ensure MH % PE == 0 @@ -728,6 +600,43 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) + def get_hw_compatible_weight_tensor(self, orig_weight_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 and MW % SIMD == 0 + * for bipolar {-1,+1} weights, convert to binary {0, 1} + * interleave rows 
between PEs
+        * reshape into (1, PE, WMEM, SIMD) and return
+        """
+        mw = self.get_nodeattr("MW")
+        mh = self.get_nodeattr("MH")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            mw,
+            mh,
+        ), """Weights matrix doesn't
+        have expected shape (mw, mh)"""
+        assert mw % simd == 0, "Requirement MW divisible by SIMD is violated."
+        assert mh % pe == 0, "Requirement MH divisible by PE is violated."
+        # start by transposing the original weight matrix, since ONNX and
+        # finn-hlslib use different assumptions
+        # ONNX uses (in_features, out_features) and matmul(x, W)
+        # finn-hlslib uses (out_features, in_features) and matmul(W, x)
+        ret = orig_weight_matrix.T
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
+        # interleave rows between PEs and reshape
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension and add a dummy outer dim
+        ret = ret.reshape(1, pe, wmem, simd)
+        # reverse the SIMD dimension
+        ret = np.flip(ret, axis=-1)
+        return ret
+
     def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         """Produce a file containing given weights in appropriate format for
         this layer. This file can be used for either synthesis or run-time reconfig
@@ -741,8 +650,8 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
         * weight_file_name : filename for the weight file to be generated
         """
-        # convert weights into hlslib-compatible format
-        weight_tensor = self.get_hls_compatible_weight_tensor(weights)
+        # convert weights into hlslib/rtllib-compatible format
+        weight_tensor = self.get_hw_compatible_weight_tensor(weights)
         export_wdt = self.get_weight_datatype()
         # we have converted bipolar weights to binary for export,
         # so use it as such for weight generation
@@ -772,7 +681,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
             f_weights.write(weight_hls_code)
             f_weights.close()
         elif "decoupled" in weight_file_mode:
-            # create a weight stream for various flavors of decoupled mode:
+            # create a weight stream for various flavors of internal_decoupled mode:
             # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
             weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
             # reverse SIMD flip for saving weights in .npy
@@ -837,22 +746,22 @@ def generate_params(self, model, path):
         code_gen_dir = path
         # weights, if not external
         weights = model.get_initializer(self.onnx_node.input[1])
-        if mem_mode == "const":
+        if mem_mode == "internal_embedded":
            # save hlslib-compatible weights in params.h
            weight_filename = "{}/params.h".format(code_gen_dir)
            self.make_weight_file(weights, "hls_header", weight_filename)
-        elif mem_mode == "decoupled" or mem_mode == "external":
+        elif mem_mode == "internal_decoupled" or mem_mode == "external":
            weight_filename_sim = "{}/weights.npy".format(code_gen_dir)
-            # save decoupled weights for cppsim
+            # save internal_decoupled weights for cppsim
            self.make_weight_file(weights, "decoupled_npy", weight_filename_sim)
-            if mem_mode == "decoupled":
+            if mem_mode == "internal_decoupled":
                # also save weights as Verilog .dat file
                # This file will be ignored when synthesizing UltraScale memory.
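To make the (1, PE, WMEM, SIMD) layout produced by `get_hw_compatible_weight_tensor` above concrete, here is a small traced example. The interleave step is written out as a plain reshape/transpose that is assumed to match what `interleave_matrix_outer_dim_from_partitions` does (rows distributed round-robin across PEs), so treat it as a sketch rather than the library's exact implementation:

```python
import numpy as np

MW, MH, PE, SIMD = 4, 4, 2, 2
wmem = MW * MH // (PE * SIMD)  # 4 entries per PE

# ONNX layout: (in_features, out_features)
w = np.arange(MW * MH).reshape(MW, MH)

# transpose to (out_features, in_features), as hlslib/rtllib expect
ret = w.T

# interleave rows between PEs: output row r is served by PE r % PE
# (plain-numpy stand-in for interleave_matrix_outer_dim_from_partitions)
ret = ret.reshape(-1, PE, MW).transpose(1, 0, 2).reshape(PE, -1, MW)

# fold SIMD into the innermost dim and add a dummy outer dim
ret = ret.reshape(1, PE, wmem, SIMD)

# reverse the SIMD dimension to match the expected stream order
ret = np.flip(ret, axis=-1)
print(ret.shape)  # (1, 2, 4, 2)
```

With MW = MH = 4 and PE = SIMD = 2, each PE ends up with `wmem = 4` memory entries of SIMD = 2 weights each, which is exactly the tiling `calc_wmem` predicts.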
weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl) else: raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", currently no other parameter value is supported!""" ) @@ -860,7 +769,7 @@ def generate_params(self, model, path): if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) # use UINT32 threshold export for bipolar times bipolar inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] @@ -905,408 +814,63 @@ def generate_params(self, model, path): f_thresh.write(thresholds_hls_code) f_thresh.close() - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - mem_mode = self.get_nodeattr("mem_mode") - node = self.onnx_node - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for MatrixVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == 
DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements. - if var == "ipgen": - SIMD = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - condition = SIMD >= (MW / 1024) - msg = ( - f"HLS synthesis of MatrixVectorActivation requires: " - f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} " - f"and MW={MW} for node: {self.onnx_node.name}." 
- ) - assert condition, msg - mem_mode = self.get_nodeattr("mem_mode") - numInputVectors = list(self.get_nodeattr("numInputVectors")) - numReps = np.prod(numInputVectors) - self.code_gen_dict["$DEFINES$"] = [ - """#define MW1 {}\n #define MH1 {}\n - #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n - #define TMEM1 {}\n #define numReps {}""".format( - self.get_nodeattr("MW"), - self.get_nodeattr("MH"), - self.get_nodeattr("SIMD"), - self.get_nodeattr("PE"), - self.calc_wmem(), - self.calc_tmem(), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - if mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights_{} ("weights_{}");'.format( - self.get_weightstream_width(), self.hls_sname(), self.hls_sname() - ) - ) + def get_op_and_param_counts(self): + in_features = self.get_nodeattr("MW") + out_features = self.get_nodeattr("MH") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + num_inp_vec = self.get_nodeattr("numInputVectors") + num_repetitions = int(np.prod(num_inp_vec)) + mac_count = in_features * out_features * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = in_features * out_features + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + if self.get_nodeattr("noActivation") == 0: + tdt = DataType[self.get_nodeattr("accDataType")] + thres_bits = tdt.bitwidth() + thres_param_type = "param_threshold_%db" % (thres_bits) + thres_count = out_features + ret_dict[thres_param_type] = thres_count + return ret_dict - def docompute(self): - mem_mode = self.get_nodeattr("mem_mode") - map_to_hls_mult_style = { - "auto": "ap_resource_dflt()", - "lut": "ap_resource_lut()", - "dsp": "ap_resource_dsp()", + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, } - tmpl_args = self.get_template_param_values() - if self.calc_tmem() == 0: - odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() - threshs = "PassThroughActivation<%s>()" % odtype_hls_str - else: - threshs = "threshs" - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Matrix_Vector_Activate_Batch - (in0_{}, out_{}, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - if wdt == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - else: - export_wdt = wdt - wdtype_hls_str = export_wdt.get_hls_datatype_str() - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Matrix_Vector_Activate_Stream_Batch - (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - wdtype_hls_str, - self.hls_sname(), - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - 
self.get_outstream_width(), - self.hls_sname(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream> &in0_{}, - hls::stream> &weights_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_weightstream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) + if mem_mode in ["internal_decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) - def pragmas(self): + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") - ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() - ) - - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) - - # the threshold tensor is acc_type [PE][TMEM][N_THRES] - # partition for parallel access along PE and N_THRES - # dimensions (dims 1 and 3) - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") - ) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") - ) - # add resource pragma for thresholds if set - if ram_style_thresholds == "distributed": - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") - ) - elif ram_style_thresholds == "block": - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") - ) - elif ram_style_thresholds == "auto": - # no pragma needed - pass - else: - raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds) + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "internal_decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names def code_generation_ipi(self): cmd = [] # add streamer 
if needed mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if self.get_nodeattr("ram_style") == "ultra": assert ( @@ -1330,11 +894,9 @@ def code_generation_ipi(self): "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) + # Instantiate either the HLS or RTL IP depending on operator + self.instantiate_ip(cmd) + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" @@ -1403,61 +965,9 @@ def code_generation_ipi(self): # TODO calculate and pass in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + elif mem_mode == "internal_embedded" or mem_mode == "external": + # base class impl sufficient for internal_embedded/external modes + self.instantiate_ip(cmd) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names - - def get_op_and_param_counts(self): - in_features = self.get_nodeattr("MW") - out_features = self.get_nodeattr("MH") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_inp_vec = self.get_nodeattr("numInputVectors") - num_repetitions = int(np.prod(num_inp_vec)) - mac_count = in_features * out_features * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types - bw1 = min(inp_bits, weight_bits) - bw2 = max(inp_bits, weight_bits) - mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) - weight_param_type = "param_weight_%db" % (weight_bits) - weight_count = in_features * out_features - ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = out_features - ret_dict[thres_param_type] = thres_count - return ret_dict - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: - n_weight_inps = self.calc_wmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/pool.py b/src/finn/custom_op/fpgadataflow/pool.py new file mode 100644 index 0000000000..35aee023b9 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/pool.py @@ -0,0 +1,224 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class Pool(HWCustomOp): + """Abstraction layer for HW implementation of Pool. + Requires ConvolutionInputGenerator(depthwise == 1) to format its input + + Input shape (BatchSize,OutImgDim,OutImgDim,TotalKernelSize*Channels) + Output shape (BatchSize,OutImgDim,OutImgDim,Channels) + + Notes: + + * The input shape was chosen to be compatible with im2col (only true when there + is not folding). 
+ * The actual data layout produced by the hlslib kernels is different + for depthwise ops. + + * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) + + Channels can be folded using PE (SIMD from the input perspective) + """ + + def get_nodeattr_types(self): + my_attrs = { + "Channels": ("i", True, 0), + "PE": ("i", True, 1), + "KernelSize": ("ints", True, []), + # Function: + # - MaxPool + # - QuantAvgPool + # TODO add support for AvgPool and AccPool + "Function": ("s", True, "", {"MaxPool", "QuantAvgPool"}), + "OutImgDims": ("ints", True, []), + # FINN DataTypes for inputs/outputs + "InputDataType": ("s", True, ""), + "OutputDataType": ("s", True, ""), + "AccumBits": ("i", False, 0), + "Size": ("i", False, 1), + "BatchSize": ("i", False, 1), + } + + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("InputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + fxn = self.get_nodeattr("Function") + odt = DataType[self.get_nodeattr("OutputDataType")] + + if fxn == "MaxPool": + # Same as input + idt = DataType[self.get_nodeattr("InputDataType")] + assert odt == idt, "In datatype must be equal to out datatype for Maxpool" + elif fxn == "QuantAvgPool": + idt = DataType[self.get_nodeattr("InputDataType")] + assert ( + idt.signed() == odt.signed() + ), """QuantAvgPool: Can't mix signed + and unsigned datatypes""" + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + return odt + + def get_normal_input_shape(self, ind=0): + ifm_ch = self.get_nodeattr("Channels") + odims = self.get_nodeattr("OutImgDims") + batch_size = self.get_nodeattr("BatchSize") + k = self.get_nodeattr("KernelSize") + k_prod = int(np.prod(k)) + ishape = (batch_size, *odims, k_prod * ifm_ch) + return ishape + + def get_folded_input_shape(self, ind=0): + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % pe == 0, "PE must divide input channels" + fold = int(normal_ishape[-1] / pe) + folded_ishape = normal_ishape[:-1] + [fold, pe] + return tuple(folded_ishape) + + def get_normal_output_shape(self, ind=0): + ofm_ch = self.get_nodeattr("Channels") + odims = self.get_nodeattr("OutImgDims") + batch_size = self.get_nodeattr("BatchSize") + oshape = (batch_size, *odims, ofm_ch) + return oshape + + def get_folded_output_shape(self, ind=0): + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % pe == 0, "PE must divide input channels" + fold = int(ifm_ch / pe) + folded_oshape = normal_oshape[:-1] + [fold, pe] + return tuple(folded_oshape) + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[1:-1]) + + def get_exp_cycles(self): + # (Channels * kernel * kernel) / PE * odim * odim * batch_size + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + k = self.get_nodeattr("KernelSize") + k_prod = int(np.prod(k)) + odims = self.get_nodeattr("OutImgDims") + batch_size = self.get_nodeattr("BatchSize") + exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size + return int(exp_cycles) + + def get_instream_width(self, ind=0): + dt_bits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = int(dt_bits * pe) + return in_width + + def 
get_outstream_width(self, ind=0): + dt_bits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = int(dt_bits * pe) + return out_width + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""Pool_Batch needs 1 data input""") + + # check supported function + fnx = self.get_nodeattr("Function") + if fnx in ["MaxPool", "QuantAvgPool"]: + info_messages.append("Attribute Function contains a supported pool function") + else: + info_messages.append("Attribute Function contains an unsupported pool function") + return info_messages + + def execute_node(self, context, graph): + # simulate behavior with Python functionality + node = self.onnx_node + fnx = self.get_nodeattr("Function") + k = self.get_nodeattr("KernelSize") + ch = self.get_nodeattr("Channels") + k2 = k[0] * k[1] + + inp_values = context[node.input[0]] + ishape = inp_values.shape + # reshape array to apply max or avg function only on kernel + tmp_shape = tuple(list(ishape)[:-1] + [k2, ch]) + tmp_values = inp_values.reshape(tmp_shape) + if fnx == "MaxPool": + result = np.max(tmp_values, axis=3) + elif fnx == "QuantAvgPool": + # determine bits to shift + ibits = self.get_input_datatype().bitwidth() + obits = self.get_output_datatype().bitwidth() + max_value = 2**ibits - 1 + max_value = max_value * k2 + max_bit_width = int(max_value).bit_length() + shift_bits = max_bit_width - obits + shift_bits = shift_bits if shift_bits >= 0 else 0 + result = np.sum(tmp_values, axis=3) + result = np.right_shift(result.astype(int), shift_bits) + oshape = context[node.output[0]].shape + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py new file mode 100644 index 0000000000..06067a4fca --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -0,0 +1,51 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from finn.custom_op.fpgadataflow.rtl.convolutioninputgenerator_rtl import ( + ConvolutionInputGenerator_rtl, +) +from finn.custom_op.fpgadataflow.rtl.fmpadding_rtl import FMPadding_rtl +from finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl import MVAU_rtl +from finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl import ( + StreamingDataWidthConverter_rtl, +) +from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl +from finn.custom_op.fpgadataflow.rtl.thresholding_rtl import Thresholding_rtl +from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import VVAU_rtl + +custom_op = dict() + +# make sure new HLSCustomOp subclasses are imported here so that they get +# registered and plug in correctly into the infrastructure +custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl +custom_op["FMPadding_rtl"] = FMPadding_rtl +custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl +custom_op["StreamingFIFO_rtl"] = StreamingFIFO_rtl +custom_op["MVAU_rtl"] = MVAU_rtl +custom_op["VVAU_rtl"] = VVAU_rtl +custom_op["Thresholding_rtl"] = Thresholding_rtl diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py similarity index 80% rename from src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py rename to src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py index d3e5576354..321522e7ba 100755 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -33,9 +33,13 @@ from qonnx.core.datatype import DataType from qonnx.custom_op.general import im2col from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( + ConvolutionInputGenerator, +) +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -57,169 +61,31 @@ # NOTE: "Parallel" implementation style not yet implemented in this version! -class ConvolutionInputGenerator_rtl(HLSCustomOp): - """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator - (sliding window) function variants. Generates an RTL ConvolutionInputGenerator - implementation based on (System-)Verilog templates, defined in finn-rtllib/swg.""" +class ConvolutionInputGenerator_rtl(ConvolutionInputGenerator, RTLBackend): + """Class that corresponds to finn-rtllib swg module. + Generates an RTL ConvolutionInputGenerator implementation + based on (System-)Verilog templates, defined in finn-rtllib/swg.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { - "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X] - "IFMChannels": ("i", True, 0), - "IFMDim": ("ints", True, []), # [H, W] = [Y, X] - "OFMDim": ("ints", True, []), # [H, W] = [Y, X] - "SIMD": ("i", True, 0), # additional parallelization parameter - not yet implemented "M": ("i", False, 1), - # Enable parallel window output (requires full SIMD unfolding) - "parallel_window": ("i", False, 0, {0, 1}), - "Stride": ("ints", True, []), # [H, W] = [Y, X] - "Dilation": ("ints", True, []), # [H, W] = [Y, X] - # FINN DataTypes for inputs, weights, outputs - "inputDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - "depthwise": ("i", False, 0, {0, 1}), - # Enable reprogrammable implementation to change FM dimensions, - # stride, or dilation during runtime (requires parallel_window = 0) - "dynamic_mode": ("i", False, 0, {0, 1}), - # FPGA resource type for ConvolutionInputGenerator input buffer - # auto -- let Vivado decide - # block -- use BRAM - # distributed -- use LUTRAM - # ultra -- use URAM - "ram_style": ( - "s", - False, - "auto", - {"auto", "block", "distributed", "ultra"}, - ), - # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, ""), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(ConvolutionInputGenerator.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs - def get_normal_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - wf = int(ifm_ch / simd) - folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - k_h, k_w = self.get_nodeattr("ConvKernelDim") - 
ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - pad = 0 - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) - oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) - return oshape - - def get_folded_output_shape(self, ind=0): - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - simd = self.get_nodeattr("SIMD") - pad = 0 - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - if self.get_nodeattr("parallel_window"): - wf = int((ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) - else: - wf = int((k_h * k_w * ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) - return folded_oshape - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - # data type stays the same - dtype = model.get_tensor_datatype(node.input[0]) - model.set_tensor_datatype(node.output[0], dtype) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - in_width = simd * ibits - return in_width - - def get_outstream_width(self, ind=0): - if self.get_nodeattr("parallel_window"): - # feed all window pixels in parallel - k_h, k_w = self.get_nodeattr("ConvKernelDim") - return self.get_instream_width() * k_h * k_w - else: - # if parallel variant not in use: same width for output and input stream - return self.get_instream_width() - def get_number_input_values(self): """Function to get the number of expected input values.""" folded_ishape = self.get_folded_input_shape() num_input_elems = np.prod(folded_ishape[:-1]) return num_input_elems - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - num_output_elems = np.prod(folded_oshape[:-1]) - return num_output_elems - - def get_1d_conv_attrs_normalized(self): - """Returns normalized spatial attributes, where H=1 for the 1D case.""" - # normalize FM dimensions so that: - # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. - # The dummy ('1') dimension is the Y-dimension. 
- ifm_ch = self.get_nodeattr("IFMChannels") - k = self.get_nodeattr("ConvKernelDim") - ifm_dim = self.get_nodeattr("IFMDim") - ofm_dim = self.get_nodeattr("OFMDim") - stride = self.get_nodeattr("Stride") - dilation = self.get_nodeattr("Dilation") - - if ifm_dim[1] == 1: - ifm_dim = ifm_dim[::-1] - ofm_dim = ofm_dim[::-1] - k = k[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + def use_parallel_window_output(self): + return self.get_nodeattr("parallel_window") def get_buffer_depth(self): """Returns total depth of the internal buffer, depending on @@ -421,15 +287,79 @@ def uram_estimation(self): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if mode == "cppsim": - raise Exception("cppsim not possible for RTL SWG, please set exec_mode to rtlsim") + ConvolutionInputGenerator.execute_node(self, context, graph) + # if depthwise = 1 + # interleave channels such that cppsim of ConvolutionInputGenerator_rtl + # has a notion of SIMD parallelism. Subsequent VVAU_{hls/rtl} expects + # the channels to be interleaved (i.e. to match their PE parallelism). + if self.get_nodeattr("depthwise"): + node = self.onnx_node + im2col_out = context[node.output[0]] + simd = getCustomOp(node).get_nodeattr("SIMD") + ofm_h, ofm_w = getCustomOp(node).get_nodeattr("OFMDim") + k_h, k_w = getCustomOp(node).get_nodeattr("ConvKernelDim") + ifm_ch = getCustomOp(node).get_nodeattr("IFMChannels") + im2col_out = im2col_out.reshape(1, ofm_h, ofm_w, k_h * k_w, ifm_ch // simd, simd) + im2col_out = im2col_out.transpose(0, 1, 2, 4, 3, 5) + im2col_out = im2col_out.reshape(1, ofm_h, ofm_w, ifm_ch * k_h * k_w) + context[node.output[0]] = im2col_out elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + # 
binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" else: raise Exception( """Invalid value for attribute exec_mode! Is currently set to: {} @@ -438,51 +368,6 @@ def execute_node(self, context, graph): ) ) - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output - shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" - def prepare_codegen_default(self): """Fills code generation dict for the default implementation style by computing the incremental addressing scheme for the circular buffer.""" @@ -971,7 +856,7 @@ def select_impl_style(self): return impl_style - def generate_hdl(self): + def generate_hdl(self, model, fpgapart, clk): """Generates HDL code and wrapper for the IP, depending on required implementation style.""" impl_style = self.select_impl_style() @@ -1177,55 +1062,3 @@ def get_dynamic_config(self, ifm_dim=None, stride=None, dilation=None): "cfg_last_write": (15 * 4, int(code_gen_dict["$LAST_WRITE_ELEM$"][0])), } return config - - def code_generation_ipgen(self, model, fpgapart, clk): - """Generates (System-)Verilog code for IP generation (instead of HLS code).""" - self.generate_hdl() - - def ipgen_singlenode_code(self): - """Not implemented (RTL component).""" - pass - - def code_generation_cppsim(self, model): - """Not implemented (RTL component).""" - pass - - def compile_singlenode_code(self): - """Not implemented (RTL component).""" - pass - - def global_includes(self): - """Not implemented (RTL component).""" - pass - - def defines(self, var): - """Not implemented (RTL component).""" - pass - - def read_npy_data(self): - """Not implemented (RTL component).""" - pass - - def strm_decl(self): - """Not implemented (RTL component).""" - pass - - def docompute(self): - """Not implemented 
(RTL component).""" - pass - - def dataoutstrm(self): - """Not implemented (RTL component).""" - pass - - def save_as_npy(self): - """Not implemented (RTL component).""" - pass - - def blackboxfunction(self): - """Not implemented (RTL component).""" - pass - - def pragmas(self): - """Not implemented (RTL component).""" - pass diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py similarity index 53% rename from src/finn/custom_op/fpgadataflow/fmpadding_rtl.py rename to src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py index d79c214730..cc49446ea3 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -30,11 +30,10 @@ import numpy as np import os import shutil -import warnings -from qonnx.core.datatype import DataType from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.fmpadding import FMPadding +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -44,7 +43,7 @@ PyVerilator = None -class FMPadding_rtl(HLSCustomOp): +class FMPadding_rtl(FMPadding, RTLBackend): """CustomOp wrapper for the finn-rtllib fmpadding_axi component Supports adjusting the padding amount and spatial feature sizes at runtime.""" @@ -54,129 +53,14 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = { - # spatial size of input images - "ImgDim": ("ints", True, []), # [H, W] = [Y, X] - # total padding (per dimension) to apply - "Padding": ( - "ints", - True, - [1, 1, 1, 1], - ), # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end] - # number of channels in input image - "NumChannels": ("i", True, 0), - # SIMD Input parallelism - "SIMD": ("i", False, 1), - # FINN input datatype - "inputDataType": ("s", True, ""), - # shape describing input vecs per execution - "numInputVectors": ("i", False, 1), # Enable reprogrammable implementation to change FM dimensions, # stride, or dilation during runtime "dynamic_mode": ("i", False, 0, {0, 1}), - # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, ""), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(FMPadding.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs - def get_padded_odim(self): - "Return the padded spatial size of the output." 
- idim_h, idim_w = self.get_nodeattr("ImgDim") - pad = self.get_nodeattr("Padding") - pad_h = pad[0] + pad[2] - pad_w = pad[1] + pad[3] - odim_h = idim_h + pad_h - odim_w = idim_w + pad_w - return [odim_h, odim_w] - - def get_exp_cycles(self): - odim_h, odim_w = self.get_padded_odim() - channels = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = (channels / simd) * batch_size * odim_h * odim_w - return int(exp_cycles) - - def get_normal_input_shape(self, ind=0): - idim_h, idim_w = self.get_nodeattr("ImgDim") - num_ch = self.get_nodeattr("NumChannels") - ishape = (1, idim_h, idim_w, num_ch) - return ishape - - def get_normal_output_shape(self, ind=0): - odim_h, odim_w = self.get_padded_odim() - num_ch = self.get_nodeattr("NumChannels") - - oshape = (1, odim_h, odim_w, num_ch) - return oshape - - def get_folded_input_shape(self, ind=0): - normal_ishape = list(self.get_normal_input_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_ishape[-1] / simd) - folded_ishape = normal_ishape[:-1] + [fold, simd] - return tuple(folded_ishape) - - def get_folded_output_shape(self, ind=0): - normal_oshape = list(self.get_normal_output_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_oshape[-1] / simd) - folded_oshape = normal_oshape[:-1] + [fold, simd] - return tuple(folded_oshape) - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape for FMPadding_rtl." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - ret = DataType[self.get_nodeattr("inputDataType")] - # the hlslib op always pads with zeros, so ensure that the DataType - # is able to represent zeros - assert ret.allowed(0), "FMPadding_rtl DataType must support zero" - return ret - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output. 
(Same as input datatype)""" - return self.get_input_datatype() - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return ibits * simd - - def get_outstream_width(self, ind=0): - obits = self.get_output_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return obits * simd - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - def get_verilog_top_module_intf_names(self): # Overload default HLSCustomOp implementation to add axilite control IF intf_names = super().get_verilog_top_module_intf_names() @@ -186,15 +70,52 @@ def get_verilog_top_module_intf_names(self): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if mode == "cppsim": - raise Exception("cppsim not possible for FMPadding_rtl, please set exec_mode to rtlsim") + FMPadding.execute_node(self, context, graph) elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim_H, OutputDim_W, NumChannels).""" + else: raise Exception( """Invalid value for attribute exec_mode! 
Is currently set to: {} @@ -203,39 +124,6 @@ def execute_node(self, context, graph): ) ) - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim_H, OutputDim_W, NumChannels).""" - def get_template_values(self, ifm_dims, pads, chans, simd, idt): dimY, dimX = ifm_dims padT, padL, padB, padR = pads @@ -283,7 +171,7 @@ def get_dynamic_config(self, ifm_dims=None, pads=None): } return config - def generate_hdl(self): + def generate_hdl(self, model, fpgapart, clk): rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fmpadding/hdl" template_path = rtlsrc + "/fmpadding_template.v" dims = self.get_nodeattr("ImgDim") @@ -369,46 +257,3 @@ def code_generation_ipi(self): % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) ] return cmd - - def code_generation_ipgen(self, model, fpgapart, clk): - """Normally: Generates C++ code and tcl script for IP generation. - Here: Generates (System-)Verilog code for IP generation.""" - self.generate_hdl() - - def ipgen_singlenode_code(self): - """Normally: Builds the bash script for IP generation.""" - pass - - def code_generation_cppsim(self, model): - """Normally: Generates C++ code for simulation (cppsim).""" - pass - - def compile_singlenode_code(self): - pass - - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - - def pragmas(self): - pass diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py new file mode 100644 index 0000000000..d48b3a918d --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -0,0 +1,292 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk + +from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +# ONNX i/o tensor shape assumptions for MatrixVectorActivation_rtl: +# input 0 is the input tensor, shape (.., i_size) = (..., MW) +# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) +# output 0 is the output tensor, shape (.., o_size) = (..., MH) +# the ... 
here can be any shape (representing groups of vectors) + + +class MVAU_rtl(MVAU, RTLBackend): + """Class that corresponds to the finn-rtllib Matrix Vector Unit.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # Flag to indicate if Versal device is targeted + "is_versal": ("i", False, 0, {0, 1}), + } + my_attrs.update(MVAU.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + return my_attrs + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + if mode == "cppsim": + MVAU.execute_node(self, context, graph) + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # create an npy file for each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input is the weights + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 1: + raise Exception("Unexpected input found for MatrixVectorActivation_rtl") + in_ind += 1 + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + reset_rtlsim(sim) + toggle_clk(sim) + if mem_mode in ["external", "internal_decoupled"]: + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def lut_estimation(self): + return 0 + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + if self.get_nodeattr("is_versal"): + mult_dsp = P * np.ceil(Q / 3) + else: + mult_dsp = np.ceil(P / 4) * Q + return int(mult_dsp) + + def instantiate_ip(self, cmd): + # instantiate the RTL IP + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + + def _resolve_segment_len(self, clk): + # Insert pipeline registers in the DSP58 chain to meet target clock frequency + # ~0.741 ns seems the worst-case delay through first DSP + # ~0.605 ns seems to be (on average) delay for all subsequent DSPs + # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 + assert ( + clk > 0.741 + ), """Infeasible clk target of {} ns has been set, + consider lowering the targeted clock frequency!""".format( + clk + ) + critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) + max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) + dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len + return dsp_chain_len + + def _resolve_impl_style(self, fpgapart): + # Based on target device and activation/weight-width, choose the + # supported RTL compute core + assert ( + self.get_nodeattr("resType") != "lut" + ), """LUT-based RTL-MVU implementation currently not supported! + Please change resType for {} to 'dsp' or consider switching to HLS-based MVAU!""".format( + self.onnx_node.name + ) + + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal_family = self.get_nodeattr("is_versal") + + if is_versal_family: + return "mvu_vvu_8sx9_dsp58" + else: + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + if (act_width == 4 and weight_width == 4) and not (is_versal_family): + return "mvu_4sx4u" + else: + return "mvu_8sx8u_dsp48" + + def generate_hdl(self, model, fpgapart, clk): + # Generate params as part of IP preparation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.generate_params(model, code_gen_dir) + + template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # add general parameters to dictionary + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to template + with open(template_path, "r") as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stitch_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_codegen_default(self, fpgapart, clk): + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" + + code_gen_dict = {} + code_gen_dict["$IS_MVU$"] = [str(1)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] + code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] + code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( + [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + ) + code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + + return template_path, code_gen_dict + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + + return sim diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py new file mode 100644 index 0000000000..e79782eb6d --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py @@ -0,0 +1,218 @@ +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import shutil + +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.custom_op.fpgadataflow.streamingdatawidthconverter import ( + StreamingDataWidthConverter, +) +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class StreamingDataWidthConverter_rtl(StreamingDataWidthConverter, RTLBackend): + """Class that corresponds to finn-rtllib datawidth converter + module.""" + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(StreamingDataWidthConverter.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + return my_attrs + + def check_divisible_iowidths(self): + iwidth = self.get_nodeattr("inWidth") + owidth = self.get_nodeattr("outWidth") + # the rtl module only supports + # stream widths that are divisible by + # integer width ratios + iwidth_d = iwidth % owidth == 0 + owidth_d = owidth % iwidth == 0 + assert ( + iwidth_d or owidth_d + ), """RTL implementation of DWC requires + stream widths that are integer width ratios + from each other. 
Input width is set to %s + and output width is set to %s """ % ( + iwidth, + owidth, + ) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + if mode == "cppsim": + StreamingDataWidthConverter.execute_node(self, context, graph) + elif mode == "rtlsim": + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple( + exp_ishape + ), """Input shape doesn't + match expected shape.""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + assert context[node.output[0]].shape == tuple( + exp_oshape + ), """Output shape doesn't match expected shape.""" + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def get_template_values(self): + topname = self.get_verilog_top_module_name() + ibits = self.get_instream_width() + obits = self.get_outstream_width() + code_gen_dict = { + "IBITS": int(ibits), + "OBITS": int(obits), + "TOP_MODULE_NAME": topname, + } + return code_gen_dict + + def generate_hdl(self, model, fpgapart, clk): + rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/dwc/hdl" + template_path = rtlsrc + "/dwc_template.v" + code_gen_dict = self.get_template_values() + # save top module name so we can refer to it after this node has been renamed + # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + for key_name in code_gen_dict: + key = "$%s$" % key_name + template = template.replace(key, str(code_gen_dict[key_name])) + + with open( + os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"), + "w", + ) as f: + f.write(template) + + sv_files = ["dwc_axi.sv", "dwc.sv"] + for sv_file in sv_files: + shutil.copy(rtlsrc + "/" + sv_file, code_gen_dir) + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stitch_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + # Modified to use generated (System-)Verilog instead of HLS output products + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = [ + "dwc_axi.sv", + "dwc.sv", + self.get_nodeattr("gen_top_module") + ".v", + ] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def code_generation_ipi(self): + """Constructs and returns the TCL for node instantiation in Vivado IPI.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + sourcefiles = [ + "dwc_axi.sv", + "dwc.sv", + self.get_nodeattr("gen_top_module") + ".v", + ] + + sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles] + + cmd = [] + for f in sourcefiles: + cmd += ["add_files -norecurse %s" % (f)] + cmd += [ + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ] + return cmd diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py new file mode 100644 index 0000000000..dfae607622 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py @@ -0,0 +1,283 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np +import os +import shutil +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class StreamingFIFO_rtl(StreamingFIFO, RTLBackend): + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # Toggle between rtl or IPI implementation + # rtl - use the rtl generated IP during stitching + # vivado - use the AXI Infrastructure FIFO + "impl_style": ("s", False, "rtl", {"rtl", "vivado"}), + } + my_attrs.update(StreamingFIFO.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + + return my_attrs + + def get_adjusted_depth(self): + impl = self.get_nodeattr("impl_style") + depth = self.get_nodeattr("depth") + if impl == "vivado": + old_depth = depth + # round up depth to nearest power-of-2 + # Vivado FIFO impl may fail otherwise + depth = (1 << (depth - 1).bit_length()) if impl == "vivado" else depth + if old_depth != depth: + warnings.warn( + "%s: rounding-up FIFO depth from %d to %d for impl_style=vivado" + % (self.onnx_node.name, old_depth, depth) + ) + + return depth + + def get_verilog_top_module_intf_names(self): + ret = super().get_verilog_top_module_intf_names() + is_rtl = self.get_nodeattr("impl_style") == "rtl" + is_depth_monitor = self.get_nodeattr("depth_monitor") == 1 + if is_rtl and is_depth_monitor: + ret["ap_none"] = ["maxcount"] + return ret + + def generate_hdl(self, model, fpgapart, clk): + rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fifo/hdl" + template_path = rtlsrc + "/fifo_template.v" + + # save top module name so we can refer to it after this node has been renamed + # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) + topname = self.get_verilog_top_module_name() + self.set_nodeattr("gen_top_module", topname) + + code_gen_dict = {} + code_gen_dict["$TOP_MODULE_NAME$"] = topname + # make instream width a multiple of 8 for axi interface + in_width = self.get_instream_width_padded() + count_width = int(self.get_nodeattr("depth") - 1).bit_length() + code_gen_dict["$COUNT_RANGE$"] = "[{}:0]".format(count_width - 1) + code_gen_dict["$IN_RANGE$"] = "[{}:0]".format(in_width - 1) + code_gen_dict["$OUT_RANGE$"] = "[{}:0]".format(in_width - 1) + code_gen_dict["$WIDTH$"] = str(in_width) + code_gen_dict["$DEPTH$"] = str(self.get_nodeattr("depth")) + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + for key_name in code_gen_dict: + key = "%s" % key_name + template = template.replace(key, str(code_gen_dict[key_name])) + with open( + os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"), + "w", + ) as f: + f.write(template) + + shutil.copy(rtlsrc + "/Q_srl.v", code_gen_dir) + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stitch_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + inp = context[node.input[0]] + exp_shape = self.get_normal_input_shape() + + if mode == "cppsim": + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # create a npy file for the input of the node + assert ( + str(inp.dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = inp.reshape(expected_inp_shape) + if DataType[self.get_nodeattr("dataType")] == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = DataType[self.get_nodeattr("dataType")] + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + output = self.rtlsim(sim, inp) + odt = DataType[self.get_nodeattr("dataType")] + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def code_generation_ipi(self): + impl_style = self.get_nodeattr("impl_style") + if impl_style == "rtl": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + sourcefiles = [ + "Q_srl.v", + self.get_nodeattr("gen_top_module") + ".v", + ] + + sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles] + + cmd = [] + for f in sourcefiles: + cmd += ["add_files -norecurse %s" % (f)] + cmd += [ + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ] + return cmd + elif impl_style == "vivado": + cmd = [] + node_name = self.onnx_node.name + depth = self.get_adjusted_depth() + ram_style = self.get_nodeattr("ram_style") + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate and configure DWC + cmd.append( + "create_bd_cell -type ip " + "-vlnv xilinx.com:ip:axis_data_fifo:2.0 /%s/fifo" % node_name + ) + cmd.append( + "set_property -dict [list CONFIG.FIFO_DEPTH {%d}] " + "[get_bd_cells /%s/fifo]" % (depth, node_name) + ) + cmd.append( + "set_property -dict [list CONFIG.FIFO_MEMORY_TYPE {%s}] " + "[get_bd_cells /%s/fifo]" % (ram_style, node_name) + ) + cmd.append( + "set_property -dict [list CONFIG.TDATA_NUM_BYTES {%d}] " + "[get_bd_cells /%s/fifo]" % (np.ceil(self.get_outstream_width() / 8), node_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/fifo/M_AXIS] " + "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/fifo/S_AXIS] " + "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] " + "[get_bd_pins %s/fifo/s_axis_aresetn]" % (node_name, rst_name, node_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] " + "[get_bd_pins %s/fifo/s_axis_aclk]" % (node_name, clk_name, node_name) + ) + return cmd + else: + raise Exception( + "FIFO implementation style %s not supported, please use rtl or vivado" % impl_style + ) + + def prepare_rtlsim(self): + assert self.get_nodeattr("impl_style") != "vivado", ( + "StreamingFIFO impl_style " + "cannot be vivado for rtlsim. Only impl_style=rtl supported." 
+ ) + # Modified to use generated (System-)Verilog instead of HLS output products + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = [ + "Q_srl.v", + self.get_nodeattr("gen_top_module") + ".v", + ] + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py new file mode 100644 index 0000000000..67b41d0165 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -0,0 +1,559 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
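The StreamingFIFO `generate_hdl` above fills its RTL template by plain string substitution: every `$KEY$` placeholder in the Verilog source is replaced with a computed value. Below is a minimal, self-contained sketch of that mechanism; the toy template string and the depth/width values are made up for illustration and stand in for the real finn-rtllib templates.

```python
# Minimal sketch of the $KEY$ substitution used by generate_hdl.
# Template text and parameter values are made up; the real templates
# live in finn-rtllib and carry many more placeholders.
depth = 32
in_width = 8

template = "module $TOP_MODULE_NAME$ #(parameter DEPTH=$DEPTH$) (input $IN_RANGE$ din);"

code_gen_dict = {
    "$TOP_MODULE_NAME$": "StreamingFIFO_rtl_0",
    "$DEPTH$": str(depth),
    # the occupancy counter must hold values 0..depth-1, so it needs
    # (depth - 1).bit_length() = 5 bits for depth 32
    "$COUNT_RANGE$": "[{}:0]".format((depth - 1).bit_length() - 1),
    "$IN_RANGE$": "[{}:0]".format(in_width - 1),
}

for key, value in code_gen_dict.items():
    template = template.replace(key, value)

print(template)
# module StreamingFIFO_rtl_0 #(parameter DEPTH=32) (input [7:0] din);
```

Placeholders absent from a given template (here `$COUNT_RANGE$`) are simply never matched, which is why the same dictionary can be applied to several template files.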
+
+import math
+import numpy as np
+import os
+import shutil
+from pyverilator.util.axi_utils import reset_rtlsim, rtlsim_multi_io
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import roundup_to_integer_multiple
+
+from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
+from finn.custom_op.fpgadataflow.thresholding import Thresholding
+from finn.util.basic import (
+    get_memutil_alternatives,
+    get_rtlsim_trace_depth,
+    make_build_dir,
+    mem_primitives_versal,
+    pyverilate_get_liveness_threshold_cycles,
+)
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    pack_innermost_dim_as_hex_string,
+    rtlsim_output_to_npy,
+)
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+
+class Thresholding_rtl(Thresholding, RTLBackend):
+    """Class that corresponds to finn-rtllib 'thresholding' function."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # memory depth triggers for threshold storage
+            "depth_trigger_uram": ("i", False, 0),
+            "depth_trigger_bram": ("i", False, 0),
+            # enable uniform threshold optimization
+            # doesn't actually do anything yet, only
+            # used for resource estimations
+            "uniform_thres": ("i", False, 0, {0, 1}),
+            # enable deep pipelining for easier timing closure
+            # setting to 0 may save some FFs but otherwise leave on
+            "deep_pipeline": ("i", False, 1, {0, 1}),
+        }
+        my_attrs.update(Thresholding.get_nodeattr_types(self))
+        my_attrs.update(RTLBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def get_pe_mem_geometries(self):
+        """Return a list of (bitwidth, depth) tuples describing the PE memory
+        configurations used in resource estimation.
+
+        The bitwidth is the bitwidth of the threshold values; the depth is the
+        number of thresholds stored in a single memory block. The number of
+        memory blocks is the number of thresholds divided by the depth,
+        multiplied by the number of PEs to cover the entire layer."""
+        pe = self.get_nodeattr("PE")
+        wdt = self.get_weight_datatype()
+        wdt_bits = wdt.bitwidth()
+        odt = self.get_output_datatype()
+        odt_bits = odt.bitwidth()
+        t_channels = self.get_nodeattr("NumChannels")
+        cf = t_channels / pe
+        is_uniform = self.get_nodeattr("uniform_thres")
+        if is_uniform:
+            ret = [(odt_bits - x, cf * (2**x)) for x in range(1, odt_bits)]
+        else:
+            ret = [(wdt_bits, (cf) * 2**x) for x in range(odt_bits)]
+        return ret
+
+    def get_memory_estimate(self):
+        """Return the memory estimate for this node."""
+        res_dict = {}
+        depth_trigger_bram = self.get_nodeattr("depth_trigger_bram")
+        depth_trigger_uram = self.get_nodeattr("depth_trigger_uram")
+        pe = self.get_nodeattr("PE")
+        ret = self.get_pe_mem_geometries()
+        for mem_cfg in ret:
+            (width, depth) = mem_cfg
+            primitives = mem_primitives_versal
+            if depth_trigger_bram != 0 or depth_trigger_uram != 0:
+                if depth >= depth_trigger_bram and depth < depth_trigger_uram:
+                    primitives = {k: v for (k, v) in mem_primitives_versal.items() if "BRAM" in k}
+                elif depth >= depth_trigger_uram:
+                    primitives = {k: v for (k, v) in mem_primitives_versal.items() if "URAM" in k}
+            alts = get_memutil_alternatives(mem_cfg, primitives)
+            primary_alt = alts[0]
+            res_type = primary_alt[0].split("_")[0]
+            res_count, eff, waste = primary_alt[1]
+            res_dict[res_type] = res_dict.get(res_type, 0) + pe * res_count
+        return res_dict
+
+    def bram_estimation(self):
+        """Return the number of BRAMs required for this node."""
+        res_dict = self.get_memory_estimate()
+        return res_dict.get("BRAM", 0)
+
+    def uram_estimation(self):
+        """Return the number of URAMs required for this node."""
+        res_dict = self.get_memory_estimate()
+        return res_dict.get("URAM", 0)
+
+    def lut_estimation(self):
+        """Return the number of LUTs required for this node."""
+        res_dict = self.get_memory_estimate()
+        return res_dict.get("LUTRAM", 0)
+
+    def get_all_meminit_filenames(self, abspath=False):
+        """Return a list of all .dat memory initializer files used for this node."""
+        dat_files = []
+        t_path = self.get_nodeattr("code_gen_dir_ipgen") if abspath else "."
+        pe = self.get_nodeattr("PE")
+        output_data_type = self.get_nodeattr("outputDataType")  # output precision
+        o_bitwidth = DataType[output_data_type].bitwidth()
+        for stage in range(o_bitwidth):
+            for pe_value in range(pe):
+                thresh_file = t_path + "/%s_threshs_%s_%s.dat" % (
+                    self.onnx_node.name,
+                    pe_value,
+                    stage,
+                )
+                dat_files.append(thresh_file)
+        return dat_files
+
+    def prepare_codegen_rtl_values(self, model):
+        """All dictionary values produced in this function are to replace
+        their key value(s) in the RTL template files"""
+        code_gen_dict = {}
+
+        # TODO check for sortedness and size here?
+        thresholds = model.get_initializer(self.onnx_node.input[1])
+        bias = self.get_nodeattr("ActVal")  # activation bias value
+        output_data_type = self.get_nodeattr("outputDataType")  # output precision
+        input_data_type = self.get_nodeattr("inputDataType")  # input/threshold precision
+        o_bitwidth = DataType[output_data_type].bitwidth()
+
+        # The RTL expects 2^N-1 thresholds, but narrow-range quantization yields
+        # one threshold less, so we prepend a dummy threshold and reduce the
+        # bias by 1 to compensate.
+ expected_thresholds = 2**o_bitwidth - 1 + n_thres_steps = self.get_nodeattr("numSteps") + if expected_thresholds != n_thres_steps and DataType[input_data_type].signed() is not True: + min_val = np.amin(thresholds, axis=1) + thresholds = np.insert(thresholds, 0, min_val, axis=1) + bias = bias - 1 + + # add dummy dimension as final dimension (that's what gets packed with next call) + thresholds = np.expand_dims(thresholds, axis=-1) + wdt = self.get_weight_datatype() + bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 4) + t_packed = pack_innermost_dim_as_hex_string( + thresholds, + wdt, + bw_hexdigit, + prefix="", + ) + + t_path = self.get_nodeattr("code_gen_dir_ipgen") + pe = self.get_nodeattr("PE") + num_channels = self.get_nodeattr("NumChannels") # number of channels + + # If a single threshold value is found, broadcast the value + expected_shape = (num_channels, n_thres_steps) + if t_packed.shape == (1, 1): + t_packed = np.broadcast_to(t_packed, expected_shape) + + channel_fold = int(num_channels / pe) + + for stage in range(o_bitwidth): + sn = o_bitwidth - stage - 1 + for pe_value in range(pe): + thresh_file = t_path + "/%s_threshs_%s_%s.dat" % ( + self.onnx_node.name, + pe_value, + stage, + ) + threshs = np.zeros([channel_fold * (2**stage)], dtype="object") + for ch in range(channel_fold): + for i in range(2**stage): + threshs[(ch << stage) + i] = t_packed[ch * pe + pe_value][ + (i << (o_bitwidth - stage)) + 2**sn - 1 + ] + with open(thresh_file, "w") as f: + for val in threshs: + f.write(val + "\n") + code_gen_dict["$THRESHOLDS_PATH$"] = ['"./%s_"' % self.onnx_node.name] + + # Identify the module name + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ + self.get_verilog_top_module_name() + "_axi_wrapper" + ] + # Set the top module name - AXI wrapper + code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] + + # Identify the module variables + i_bitwidth = DataType[input_data_type].bitwidth() + + code_gen_dict["$N$"] = [str(o_bitwidth)] # output precision - convert bitwidth to string + code_gen_dict["$M$"] = [ + str(i_bitwidth) + ] # input/threshold precision - convert bitwidth to string + code_gen_dict["$C$"] = [str(num_channels)] # number of channels + code_gen_dict["$BIAS$"] = [str(bias)] # activation bias value + code_gen_dict["$PE$"] = [str(pe)] # requires C = M*PE + + # Is the input datatype signed or unsigned? 
+        # The thresholding core needs to know this when comparing weights to inputs
+        if self.get_input_datatype().signed():
+            code_gen_dict["$SIGNED$"] = [str(1)]
+        else:
+            code_gen_dict["$SIGNED$"] = [str(0)]
+
+        if bias >= 0:
+            o_bits = math.ceil(math.log2(2**o_bitwidth + bias))
+        else:
+            o_bits = 1 + math.ceil(
+                math.log2(-bias if -bias >= 2 ** (o_bitwidth - 1) else 2**o_bitwidth + bias)
+            )
+
+        code_gen_dict["$O_BITS$"] = [str(int(o_bits))]
+
+        rt_weights = self.get_nodeattr("runtime_writeable_weights")
+        code_gen_dict["$USE_AXILITE$"] = [str(rt_weights)]
+
+        depth_trigger_uram = self.get_nodeattr("depth_trigger_uram")
+        depth_trigger_bram = self.get_nodeattr("depth_trigger_bram")
+        deep_pipeline = self.get_nodeattr("deep_pipeline")
+        code_gen_dict["$DEPTH_TRIGGER_URAM$"] = [str(depth_trigger_uram)]
+        code_gen_dict["$DEPTH_TRIGGER_BRAM$"] = [str(depth_trigger_bram)]
+        code_gen_dict["$DEEP_PIPELINE$"] = [str(deep_pipeline)]
+        return code_gen_dict
+
+    def get_rtl_file_list(self):
+        """Thresholding binary search RTL file list"""
+        return [
+            "axilite_if.v",
+            "thresholding.sv",
+            "thresholding_axi.sv",
+            "thresholding_template_wrapper.v",
+        ]
+
+    def get_rtl_file_paths(self):
+        """Get full path of all RTL files"""
+        rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/"
+        rtl_file_list = self.get_rtl_file_list()
+        rtl_file_paths = [rtl_root_dir + file for file in rtl_file_list]
+        return rtl_file_paths
+
+    def get_rtl_template_data(self, path):
+        """Return RTL file contents as a template"""
+        with open(path, "r") as f:
+            template = f.read()
+        return template
+
+    def fill_in_rtl_template_data(self, replace_dict, template_data):
+        """Use attribute values to fill in RTL template placeholders"""
+        template_data_cp = template_data
+        for key in replace_dict:
+            replacement_line = "\n".join(replace_dict[key])
+            template_data_cp = template_data_cp.replace(key, replacement_line)
+        return template_data_cp
+
+    def dump_rtl_data(self, dest_dir, filename, data):
+        """Dump filled-in-template RTL files for future synthesis step"""
+        # when generating template files, handle a special case:
+        # if the filename contains the word "template", replace that
+        # with the node name to distinguish between instances
+        if "template" in filename:
+            filename = self.get_nodeattr("gen_top_module") + ".v"
+        with open(os.path.join(dest_dir, filename), "w") as f:
+            f.write(data)
+        return
+
+    def generate_hdl(self, model, fpgapart, clk):
+        """Prepare HDL files from templates for synthesis"""
+        # Generate a dictionary of values to put in RTL template
+        code_gen_dict = self.prepare_codegen_rtl_values(model)
+
+        # Retrieve the destination directory for the final RTL files
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+        # Set the 'gen_top_module' attribute for use later
+        # by PyVerilator and IPI generation
+        self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0])
+
+        weights = model.get_initializer(self.onnx_node.input[1])
+        weights_fname = f"{code_gen_dir}/memblock.dat"
+        self.make_weight_file(weights, "decoupled", weights_fname)
+
+        for rtl_file_path in self.get_rtl_file_paths():
+            # read in original RTL template file
+            template_data = self.get_rtl_template_data(rtl_file_path)
+            # apply code generation to templates
+            data = self.fill_in_rtl_template_data(code_gen_dict, template_data)
+            # dump filled-in template to destination directory for compilation
+            file_only_path = rtl_file_path.split("/")[-1]
+            self.dump_rtl_data(code_gen_dir, file_only_path, data)
+
+        # set ipgen_path and ip_path so that the HLS synthesis
+        # and stitch_ip transformations do not complain,
+        # i.e. during the HLSSynthIP() transformation
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+        return
+
+    def prepare_rtlsim(self):
+        """Creates a Verilator emulation library for the RTL code generated
+        for this node, sets the rtlsim_so attribute to its path and returns
+        a PyVerilator wrapper around it."""
+
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        verilog_paths = [code_gen_dir]
+        verilog_files = [
+            x.replace("thresholding_template_wrapper", self.get_nodeattr("gen_top_module"))
+            for x in self.get_rtl_file_list()
+        ]
+        dat_files = self.get_all_meminit_filenames(abspath=True)
+        single_src_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_")
+        for dat_file in dat_files:
+            shutil.copy(dat_file, single_src_dir)
+
+        # build the Verilator emulation library
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=single_src_dir,
+            verilog_path=verilog_paths,
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_nodeattr("gen_top_module"),
+            auto_eval=False,
+        )
+
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        if mode == "cppsim":
+            Thresholding.execute_node(self, context, graph)
+        elif mode == "rtlsim":
+            node = self.onnx_node
+            # create a npy file for each input of the node (in_ind is input index)
+            in_ind = 0
+            for inputs in node.input:
+                # it is assumed that the first input of the node is the data input
+                # the second input are the thresholds
+                if in_ind == 0:
+                    assert (
+                        str(context[inputs].dtype) == "float32"
+                    ), """Input datatype is
+                    not float32 as expected."""
+                    expected_inp_shape = self.get_folded_input_shape()
+                    reshaped_input = context[inputs].reshape(expected_inp_shape)
+
+                    if self.get_input_datatype() == DataType["BIPOLAR"]:
+                        # store bipolar activations as binary
+                        reshaped_input = (reshaped_input + 1) / 2
+                        export_idt = DataType["BINARY"]
+                    else:
+                        export_idt = self.get_input_datatype()
+
+                    # make copy before saving the array
+                    reshaped_input = reshaped_input.copy()
+                    np.save(
+                        os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                        reshaped_input,
+                    )
+                elif in_ind > 2:
+                    raise Exception("Unexpected input found for Thresholding_rtl")
+                in_ind += 1
+
+            # Create a PyVerilator wrapper of the RTLSim .so
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
+            io_names = self.get_verilog_top_module_intf_names()
+            istream_name = io_names["s_axis"][0][0]
+            ostream_name = io_names["m_axis"][0][0]
+            io_dict = {
+                "inputs": {istream_name: inp},
+                "outputs": {ostream_name: []},
+            }
+
+            trace_file = self.get_nodeattr("rtlsim_trace")
+            if trace_file == "default":
+                trace_file = self.onnx_node.name + ".vcd"
+            sname = "_"
+
+            # Change into so directory to ensure threshold files can be found
+            rtlsim_so = self.get_nodeattr("rtlsim_so")
+            so_dir = os.path.dirname(os.path.realpath(rtlsim_so))
+            olcwd = os.getcwd()
+            os.chdir(so_dir)
+            num_out_values = self.get_number_output_values()
+            reset_rtlsim(sim)
+            total_cycle_count = rtlsim_multi_io(
+                sim,
+                io_dict,
+                num_out_values,
+                trace_file=trace_file,
+                sname=sname,
+                liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
+            )
+            self.set_nodeattr("cycles_rtlsim", total_cycle_count)
+            os.chdir(olcwd)
+            output = io_dict["outputs"][ostream_name]
+
+            # Manage output data
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+
+            rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
+
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+    def code_generation_ipi(self):
+        """Constructs and returns the TCL commands for node instantiation as an RTL
+        block."""
+        rtl_file_list = [
+            x.replace("thresholding_template_wrapper", self.get_nodeattr("gen_top_module"))
+            for x in self.get_rtl_file_list()
+        ]
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name
+        cmd = ["file mkdir %s" % source_target]
+
+        for rtl_file in rtl_file_list:
+            cmd.append(
+                "add_files -copy_to %s -norecurse %s"
+                % (source_target, os.path.join(code_gen_dir, rtl_file))
+            )
+
+        # Create an RTL block, not an IP core (-type ip)
+        cmd.append(
+            "create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
+        )
+
+        return cmd
+
+    def get_verilog_top_module_intf_names(self):
+        intf_names = super().get_verilog_top_module_intf_names()
+        if self.get_nodeattr("runtime_writeable_weights") == 1:
+            intf_names["axilite"] = ["s_axilite"]
+
+        return intf_names
+
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights (thresholds) in appropriate
+        format for this layer. This file can be used for either synthesis or
+        run-time reconfig of weights.
+
+        Arguments:
+
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : mode for the produced file, e.g. "decoupled"
+        * weight_file_name : filename for the weight file to be generated
+
+        """
+        threshold_tensor = self.get_hw_compatible_threshold_tensor(weights)
+        tdt = self.get_weight_datatype()
+        assert np.vectorize(tdt.allowed)(
+            threshold_tensor
+        ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
+
+        pe = self.get_nodeattr("PE")
+        ch = self.get_nodeattr("NumChannels")
+        n_thres_steps = self.get_nodeattr("numSteps")
+
+        # If a single threshold value is found, broadcast the value
+        expected_shape = (ch, n_thres_steps)
+        if weights.shape == (1, 1):
+            weights = np.broadcast_to(weights, expected_shape)
+
+        width_padded = roundup_to_integer_multiple(weights.shape[1], 4)
+        weight_padded = np.zeros((weights.shape[0], width_padded))
+        weight_padded[: weights.shape[0], :n_thres_steps] = weights
+        weight_stream = []
+        wdt = self.get_weight_datatype()
+        bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32)
+        padding = np.zeros(width_padded, dtype=np.int32)
+
+        chan_ind = 0
+        cf = ch // pe
+        for fold in range(cf):
+            for c in range(2 ** (pe - 1).bit_length()):
+                if (c == 0 or c % pe != 0) and c < pe:
+                    for w in weight_padded[chan_ind]:
+                        w_packed = pack_innermost_dim_as_hex_string(
+                            [w], wdt, bw_hexdigit, prefix=""
+                        ).item()
+                        weight_stream.append(w_packed)
+                    chan_ind += 1
+                else:
+                    for z in padding:
+                        w_packed = pack_innermost_dim_as_hex_string(
+                            [z], wdt, bw_hexdigit, prefix=""
+                        ).item()
+                        weight_stream.append(w_packed)
+        with open(weight_file_name, "w") as f:
+            for val in weight_stream:
+                f.write(val + "\n")
diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
new file mode 100644
index 0000000000..b315d913e4
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py
@@ -0,0 +1,285 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
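Before moving on to the VVAU: the narrow-range threshold handling in `Thresholding_rtl.prepare_codegen_rtl_values` above is easy to verify by hand. The numpy sketch below uses made-up threshold values; it shows that when the expected 2^N - 1 thresholds per channel are one short, the per-channel minimum is prepended as a dummy threshold and the activation bias is reduced by 1.

```python
import numpy as np

# Sketch of the narrow-range threshold padding from
# prepare_codegen_rtl_values; values below are made up.
o_bitwidth = 2  # N: output precision
expected_thresholds = 2**o_bitwidth - 1  # RTL wants 3 thresholds per channel
thresholds = np.array([[1.0, 5.0], [2.0, 6.0]])  # only 2 per channel (narrow range)
bias = 0
n_thres_steps = thresholds.shape[1]

if expected_thresholds != n_thres_steps:
    # prepend the per-channel minimum as a dummy threshold
    min_val = np.amin(thresholds, axis=1)
    thresholds = np.insert(thresholds, 0, min_val, axis=1)
    # compensate for the extra threshold step
    bias = bias - 1

print(thresholds)  # [[1. 1. 5.] [2. 2. 6.]]
print(bias)        # -1
```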
+
+import numpy as np
+import os
+from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
+from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+from finn.util.fpgadataflow import is_versal
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+
+class VVAU_rtl(VVAU, RTLBackend):
+    """Class that corresponds to the finn-rtllib Vector Vector Unit."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(VVAU.get_nodeattr_types(self))
+        my_attrs.update(RTLBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        mem_mode = self.get_nodeattr("mem_mode")
+        node = self.onnx_node
+
+        if mode == "cppsim":
+            VVAU.execute_node(self, context, graph)
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            # create a npy file for each input of the node (in_ind is input index)
+            in_ind = 0
+            for inputs in node.input:
+                # it is assumed that the first input of the node is the data input
+                # the second input are the weights
+                # the third input are the thresholds
+                if in_ind == 0:
+                    assert (
+                        str(context[inputs].dtype) == "float32"
+                    ), """Input datatype is
+                    not float32 as expected."""
+                    expected_inp_shape = self.get_folded_input_shape()
+                    reshaped_input = context[inputs].reshape(expected_inp_shape)
+                    if self.get_input_datatype() == DataType["BIPOLAR"]:
+                        # store bipolar activations as binary
+                        reshaped_input = (reshaped_input + 1) / 2
+                        export_idt = DataType["BINARY"]
+                    else:
+                        export_idt = self.get_input_datatype()
+                    # make copy before saving the array
+                    reshaped_input = reshaped_input.copy()
+                    np.save(
+                        os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+                        reshaped_input,
+                    )
+                elif in_ind > 2:
+                    raise Exception("Unexpected input found for VectorVectorActivation")
+                in_ind += 1
+
+            sim = self.get_rtlsim()
+            nbits = self.get_instream_width()
+            inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
+            reset_rtlsim(sim)
+            toggle_clk(sim)
+
+            if mem_mode in ["external", "internal_decoupled"]:
+                wnbits = self.get_weightstream_width()
+                export_wdt = self.get_weight_datatype()
+                # we have converted bipolar weights to binary for export,
+                # so use it as such for weight generation
+                if self.get_weight_datatype() == DataType["BIPOLAR"]:
+                    export_wdt = DataType["BINARY"]
+                wei = npy_to_rtlsim_input(
+                    "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits
+                )
+                dim_h, dim_w = self.get_nodeattr("Dim")
+                num_w_reps = dim_h * dim_w
+
+                io_dict = {
+                    "inputs": {"in0": inp, "weights": wei * num_w_reps},
+                    "outputs": {"out": []},
+                }
+                self.rtlsim_multi_io(sim, io_dict)
+                output = io_dict["outputs"]["out"]
+            else:
+                output = self.rtlsim(sim, inp)
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
+
+            # load and reshape output
+            output = np.load(out_npy_path)
+            oshape = self.get_normal_output_shape()
+            output = np.asarray([output],
dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def lut_estimation(self): + return 0 + + def dsp_estimation(self): + Q = self.get_nodeattr("SIMD") + return int(np.ceil(Q / 3)) + + def instantiate_ip(self, cmd): + # instantiate the RTL IP + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + + def generate_hdl(self, model, fpgapart, clk): + # Generate params as part of IP preparation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.generate_params(model, code_gen_dir) + + template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # add general parameters to dictionary + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to template + with open(template_path, "r") as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def _resolve_segment_len(self, clk): + # Insert pipeline registers in the DSP58 chain to meet target clock frequency + # ~0.741 ns seems the worst-case delay through first DSP + # ~0.605 ns seems to be (on average) delay for all subsequent DSPs + # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 + assert ( + clk > 0.741 + ), """Infeasible clk target of {} ns has been set, + consider lowering the targeted clock frequency!""".format( + clk + ) + critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) + max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) + dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len + return dsp_chain_len + + def _resolve_impl_style(self, fpgapart): + # Based on target device and activation/weight-width, choose the + # supported RTL compute core + assert ( + self.get_nodeattr("resType") != "lut" + ), """LUT-based RTL-VVU implementation currently not supported! 
+ Please change resType for {} to 'dsp' or consider switching to HLS-based VVAU!""".format( + self.onnx_node.name + ) + is_versal_family = is_versal(fpgapart) + assert ( + is_versal_family + ), "DSP-based (RTL) VVU currently only supported on Versal (DSP58) devices" + + return "mvu_vvu_8sx9_dsp58" + + def prepare_codegen_default(self, fpgapart, clk): + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" + + code_gen_dict = {} + code_gen_dict["$IS_MVU$"] = [str(0)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + mw = int(np.prod(self.get_nodeattr("Kernel"))) + code_gen_dict["$MW$"] = [str(mw)] + code_gen_dict["$MH$"] = [str(self.get_nodeattr("Channels"))] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] + code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( + [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + ) + code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + + return template_path, code_gen_dict + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + + return sim diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py new file mode 100644 index 0000000000..2e4d647b22 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py @@ -0,0 +1,64 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from abc import ABC, abstractmethod
+
+
+class RTLBackend(ABC):
+    """Base class for all custom ops that correspond to a module in finn-rtllib.
+    Contains the functions every RTL custom node should have. Some are abstract
+    methods that have to be filled in when writing a new RTL custom op node."""
+
+    def get_nodeattr_types(self):
+        return {
+            # attribute to save top module name - not user configurable
+            "gen_top_module": ("s", False, ""),
+        }
+
+    @abstractmethod
+    def generate_hdl(self, model, fpgapart, clk):
+        pass
+
+    @abstractmethod
+    def prepare_rtlsim(self):
+        pass
+
+    @abstractmethod
+    def code_generation_ipi(self):
+        pass
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        self.generate_hdl(model, fpgapart, clk)
+
+    # TODO: Implement alternative
+    def hls_sname(self):
+        """Get the naming convention used by Vitis HLS for stream signals.
+        Example: the TDATA for a stream called "out" would be out_V_TDATA.
+        """
+        return "V"
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py
new file mode 100644
index 0000000000..4921caeb00
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py
@@ -0,0 +1,216 @@
+# Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+# This op does not do anything at the ONNX node-by-node level, and the input
+# and output tensor shapes are the same. It performs data width conversion
+# at the rtlsim level.
+
+
+class StreamingDataWidthConverter(HWCustomOp):
+    """Abstraction layer for HW implementation of StreamingDataWidthConverter"""
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # shape of input/output tensors
+            "shape": ("ints", True, []),
+            # bit width of input and output streams
+            "inWidth": ("i", True, 0),
+            "outWidth": ("i", True, 0),
+            # FINN DataTypes for inputs/outputs
+            "dataType": ("s", True, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("dataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("dataType")]
+
+    def get_normal_input_shape(self, ind=0):
+        ishape = self.get_nodeattr("shape")
+        return ishape
+
+    def get_normal_output_shape(self, ind=0):
+        oshape = self.get_nodeattr("shape")
+        return oshape
+
+    def get_iowidth_lcm(self):
+        iwidth = self.get_nodeattr("inWidth")
+        owidth = self.get_nodeattr("outWidth")
+        return int(np.lcm(iwidth, owidth))
+
+    def needs_lcm(self):
+        iwidth = self.get_nodeattr("inWidth")
+        owidth = self.get_nodeattr("outWidth")
+        maxwidth = max(iwidth, owidth)
+        minwidth = min(iwidth, owidth)
+        return maxwidth % minwidth != 0
+
+    def check_divisible_iowidths(self):
+        pass
+
+    def get_folded_input_shape(self, ind=0):
+        self.check_divisible_iowidths()
+        iwidth = self.get_nodeattr("inWidth")
+        ishape = self.get_normal_input_shape()
+        dummy_t = np.random.randn(*ishape)
+        ibits = self.get_input_datatype().bitwidth()
+        assert (
+            iwidth % ibits == 0
+        ), """DWC input width must be divisible by
+        input element bitwidth"""
+        ielems = int(iwidth // ibits)
+        ichannels = ishape[-1]
+        new_shape = []
+        for i in ishape[:-1]:
+            new_shape.append(i)
+        new_shape.append(int(ichannels // ielems))
+        new_shape.append(ielems)
+        dummy_t = dummy_t.reshape(new_shape)
+        return dummy_t.shape
+
+    def get_folded_output_shape(self, ind=0):
+        self.check_divisible_iowidths()
+        owidth = self.get_nodeattr("outWidth")
+        oshape = self.get_normal_output_shape()
+        dummy_t = np.random.randn(*oshape)
+        obits = self.get_output_datatype().bitwidth()
+        assert (
+            owidth % obits == 0
+        ), """DWC output width must be divisible by
+        output element bitwidth"""
+        oelems = int(owidth // obits)
+        ochannels = oshape[-1]
+        new_shape = []
+        for i in oshape[:-1]:
+            new_shape.append(i)
+        new_shape.append(int(ochannels // oelems))
+        new_shape.append(oelems)
+        dummy_t = dummy_t.reshape(new_shape)
+
+        return dummy_t.shape
+
+    def get_number_output_values(self):
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def get_instream_width(self, ind=0):
+        in_width = self.get_nodeattr("inWidth")
+        return in_width
+
+    def get_outstream_width(self, ind=0):
+        out_width = self.get_nodeattr("outWidth")
+        return out_width
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == tuple(exp_ishape), "Unexpected input shape for StreamingDWC."
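To make the `RTLBackend` contract above concrete, here is a toy sketch; the simplified base class and the `MyOp_rtl` subclass are hypothetical and not part of FINN. It illustrates that a new RTL custom op can only be instantiated once all three abstract methods have been implemented.

```python
from abc import ABC, abstractmethod


class RTLBackendSketch(ABC):
    """Simplified stand-in for RTLBackend, for illustration only."""

    @abstractmethod
    def generate_hdl(self, model, fpgapart, clk):
        pass

    @abstractmethod
    def prepare_rtlsim(self):
        pass

    @abstractmethod
    def code_generation_ipi(self):
        pass


class MyOp_rtl(RTLBackendSketch):
    """Hypothetical RTL custom op fulfilling the contract."""

    def generate_hdl(self, model, fpgapart, clk):
        pass  # would fill in and dump RTL templates here

    def prepare_rtlsim(self):
        return None  # would build and return a PyVerilator wrapper here

    def code_generation_ipi(self):
        return []  # would return TCL instantiation commands here


MyOp_rtl()  # instantiates fine; RTLBackendSketch() would raise TypeError
```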
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("dataType", idt.name) + # data type stays the same + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""StreamingDWC needs 1 data input""") + + return info_messages + + def execute_node(self, context, graph): + node = self.onnx_node + exp_shape = self.get_normal_input_shape() + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." + + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + + def lut_estimation(self): + """Calculates resource estimations for LUTs""" + inw = self.get_instream_width() + outw = self.get_outstream_width() + + minw = min(inw, outw) + maxw = max(inw, outw) + + # sometimes widths aren't directly divisible + # this requires going up from input width to least common multiple + # then down to output width + intw = abs(maxw * minw) // math.gcd(maxw, minw) + + # we assume a shift-based implementation + # even if we don't use LUTs explicitly, we make some unavailable + # to other logic because they're tied into the DWC control sets + + cnt_luts = 0 + cset_luts = 0 + + if inw != intw: + cnt_luts += abs(math.ceil(math.log(inw / intw, 2))) + cset_luts += intw + if intw != outw: + cnt_luts += abs(math.ceil(math.log(intw / outw, 2))) + cset_luts += outw + + return int(cnt_luts + cset_luts) diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py deleted file mode 100644 index baf4aed502..0000000000 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py +++ /dev/null @@ -1,540 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import math -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - -# does not do anything at the ONNX node-by-node level, and input-output -# tensor shapes are the same. performs data width conversion at the rtlsim level - - -class StreamingDataWidthConverter_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib StreamingDataWidthConverter_Batch - function.""" - - def get_nodeattr_types(self): - my_attrs = { - # shape of input/output tensors - "shape": ("ints", True, []), - # bit width of input and output streams - "inWidth": ("i", True, 0), - "outWidth": ("i", True, 0), - # FINN DataTypes for inputs/outputs - "dataType": ("s", True, ""), - # Toggle between hls or IPI implementation - # hls - use the hls generated IP during stitching - # vivado - use the AXI Infrastructure DWC - "impl_style": ("s", False, "hls", {"hls", "vivado"}), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("dataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("dataType")] - - def get_normal_input_shape(self, ind=0): - ishape = self.get_nodeattr("shape") - return ishape - - def get_normal_output_shape(self, ind=0): - oshape = self.get_nodeattr("shape") - return oshape - - def check_divisible_iowidths(self): - impl_style = self.get_nodeattr("impl_style") - iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - if impl_style == "vivado": - # the AXIS IP we use in vivado mode only supports - # stream widths that are divisible by 8 - iwidth_d8 = iwidth % 8 == 0 - owidth_d8 = owidth % 8 == 0 - assert ( - iwidth_d8 and owidth_d8 - ), """DWC impl_style=vivado requires - stream widths that are divisible by 8: (%d, %d)""" % ( - iwidth, - owidth, - ) - - def get_iowidth_lcm(self): - iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - return int(np.lcm(iwidth, owidth)) - - def needs_lcm(self): - iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - maxwidth = max(iwidth, owidth) - minwidth = min(iwidth, owidth) - impl_style = self.get_nodeattr("impl_style") - return (impl_style == "hls") and (maxwidth % minwidth != 0) - - def get_folded_input_shape(self, ind=0): - self.check_divisible_iowidths() - iwidth = self.get_nodeattr("inWidth") - ishape = self.get_normal_input_shape() - dummy_t = np.random.randn(*ishape) - ibits = self.get_input_datatype().bitwidth() - assert ( - 
iwidth % ibits == 0 - ), """DWC input width must be divisible by - input element bitwidth""" - ielems = int(iwidth // ibits) - ichannels = ishape[-1] - new_shape = [] - for i in ishape[:-1]: - new_shape.append(i) - new_shape.append(int(ichannels // ielems)) - new_shape.append(ielems) - dummy_t = dummy_t.reshape(new_shape) - return dummy_t.shape - - def get_folded_output_shape(self, ind=0): - self.check_divisible_iowidths() - owidth = self.get_nodeattr("outWidth") - oshape = self.get_normal_output_shape() - dummy_t = np.random.randn(*oshape) - obits = self.get_output_datatype().bitwidth() - assert ( - owidth % obits == 0 - ), """DWC output width must be divisible by - input element bitwidth""" - oelems = int(owidth // obits) - ochannels = oshape[-1] - new_shape = [] - for i in oshape[:-1]: - new_shape.append(i) - new_shape.append(int(ochannels // oelems)) - new_shape.append(oelems) - dummy_t = dummy_t.reshape(new_shape) - - return dummy_t.shape - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def get_instream_width(self, ind=0): - in_width = self.get_nodeattr("inWidth") - return in_width - - def get_outstream_width(self, ind=0): - out_width = self.get_nodeattr("outWidth") - return out_width - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == tuple(exp_ishape), "Unexpect input shape for StreamingDWC." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("dataType", idt.name) - # data type stays the same - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify the number of inputs - if len(self.onnx_node.input) == 1: - info_messages.append("The number of inputs is correct") - else: - info_messages.append("""StreamingDWC needs 1 data input""") - - return info_messages - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] - - def defines(self, var): - numReps = 1 - numInWords = int(np.prod(self.get_folded_input_shape()[:-1])) - inWidth = self.get_nodeattr("inWidth") - outWidth = self.get_nodeattr("outWidth") - self.code_gen_dict["$DEFINES$"] = [ - "#define InWidth %d " % inWidth, - "#define OutWidth %d " % outWidth, - "#define NumInWords %d " % numInWords, - "#define numReps %d" % numReps, - ] - if self.needs_lcm(): - lcmWidth = self.get_iowidth_lcm() - assert numInWords % (lcmWidth / inWidth) == 0, "Error in DWC LCM calculation" - numLCMToOut = numInWords // (lcmWidth / inWidth) - self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth) - self.code_gen_dict["$DEFINES$"].append("#define NumLCMToOut %d" % (numLCMToOut)) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == 
DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - if self.needs_lcm(): - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> intermediate ("intermediate");'.format( - self.get_iowidth_lcm() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - # TODO continue with fxns below, they are copy-pasted - op = "StreamingDataWidthConverter_Batch" - if self.needs_lcm(): - self.code_gen_dict["$DOCOMPUTE$"] = [ - 'hls::stream> intermediate ("intermediate");'.format( - self.get_iowidth_lcm() - ), - "%s(in0_%s, intermediate, numReps);" - % (op, self.hls_sname()), - "%s(intermediate, out_%s, numReps);" - % (op, self.hls_sname()), - ] - else: - self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0_%s, out_%s, numReps);" - % (op, self.hls_sname(), self.hls_sname()) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - in_packed_bits = self.get_instream_width() - in_packed_hls_type = "ap_uint<%d>" % in_packed_bits - out_packed_bits = self.get_outstream_width() - out_packed_hls_type = "ap_uint<%d>" % out_packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - in_packed_hls_type, - self.hls_sname(), - out_packed_hls_type, - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - if self.needs_lcm(): - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW disable_start_propagation") - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - impl_style = 
self.get_nodeattr("impl_style") - node = self.onnx_node - exp_shape = self.get_normal_input_shape() - folded_ishape = self.get_folded_input_shape() - - # TODO ensure codegen dir exists - if mode == "cppsim": - assert impl_style == "hls", "DWC cppsim only possible when impl_style==hls" - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - assert impl_style == "hls", "DWC rtlsim only possible when impl_style==hls" - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." - - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # reshape input into folded shape - reshaped_input = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = reshaped_input.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - output = inp - output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) - context[node.output[0]] = output - - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(exp_shape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to "rtlsim" """.format( - mode - ) - ) - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert context[node.output[0]].shape == tuple( - exp_shape - ), """Output - shape doesn't match expected shape, should be same as input shape""" - - def code_generation_ipi(self): - impl_style = self.get_nodeattr("impl_style") - if impl_style == "hls": - return super().code_generation_ipi() - elif impl_style == "vivado": - cmd = [] - node_name = self.onnx_node.name - # create a hierarchy for this layer, with the same port names - clk_name = self.get_verilog_top_module_intf_names()["clk"][0] - rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] - din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] - cmd.append("create_bd_cell -type hier %s" % node_name) - cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) - cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) - cmd.append( - "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) - ) - cmd.append( - "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) - ) - # instantiate and configure DWC - cmd.append( - "create_bd_cell -type ip " - "-vlnv xilinx.com:ip:axis_dwidth_converter:1.1 /%s/dwc" % node_name - ) - cmd.append( - "set_property -dict " - "[list CONFIG.S_TDATA_NUM_BYTES.VALUE_SRC USER] " - "[get_bd_cells /%s/dwc]" % node_name - ) - cmd.append( - "set_property -dict " - "[list CONFIG.S_TDATA_NUM_BYTES {%d}] [get_bd_cells /%s/dwc]" - % (np.ceil(self.get_instream_width() / 8), node_name) - ) - cmd.append( - "set_property -dict " - "[list CONFIG.M_TDATA_NUM_BYTES {%d}] [get_bd_cells /%s/dwc]" - % (np.ceil(self.get_outstream_width() / 8), node_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/dwc/M_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/dwc/S_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/dwc/aresetn]" - % (node_name, rst_name, node_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/dwc/aclk]" - % (node_name, clk_name, node_name) - ) - return cmd - else: - raise Exception( - "DWC implementation style %s not supported, please use hls or vivado" % impl_style - ) - - def lut_estimation(self): - """Calculates resource estimations for LUTs""" - inw = self.get_instream_width() - outw = self.get_outstream_width() - - minw = min(inw, outw) - maxw = max(inw, outw) - - # sometimes withs aren't directly divisible - # this requires going up from input width to least common multiple - # then down to output width - intw = abs(maxw * minw) // math.gcd(maxw, minw) - - # we assume a shift-based implementation - # even if we don't use LUTs explicitly, we make some unavailable - # to other logic because they're tied into the DWC control sets - - cnt_luts = 0 - cset_luts = 0 - - if inw != intw: - cnt_luts += abs(math.ceil(math.log(inw / intw, 2))) - cset_luts += intw - if intw != outw: - cnt_luts += abs(math.ceil(math.log(intw / outw, 2))) - cset_luts += outw - - return int(cnt_luts + cset_luts) - - def prepare_rtlsim(self): - 
assert self.get_nodeattr("impl_style") != "vivado", ( - "StreamingDataWidthConverter impl_style " - "cannot be vivado for rtlsim. Only impl_style=rtl supported." - ) - super().prepare_rtlsim() - - def code_generation_ipgen(self, model, fpgapart, clk): - # no codegen required for impl_style=vivado since - # that uses premade, configurable AXIS IP - if self.get_nodeattr("impl_style") == "hls": - super().code_generation_ipgen(model, fpgapart, clk) - - def ipgen_singlenode_code(self): - # no IP generation required for impl_style=vivado since - # that uses premade, configurable AXIS IP - if self.get_nodeattr("impl_style") == "hls": - super().ipgen_singlenode_code() - else: - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - # set ipgen_path and ip_path so that HLSSynthIP - # and CreatedStitchedIP transformations do not complain - self.set_nodeattr("ipgen_path", code_gen_dir) - self.set_nodeattr("ip_path", code_gen_dir) diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_rtl.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_rtl.py deleted file mode 100644 index 4f592bafaa..0000000000 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_rtl.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (C) 2023, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
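For orientation, the width-bridging rule used by `lut_estimation()` and `docompute()` above can be stated compactly: stream widths without an integer ratio are bridged through an intermediate stream whose width is the least common multiple of the two. A minimal Python sketch (illustrative only, not part of the patch; the helper name is hypothetical):

import math

def dwc_intermediate_width(inw: int, outw: int) -> int:
    # least common multiple of the two stream widths, as in lut_estimation()
    return abs(inw * outw) // math.gcd(inw, outw)

assert dwc_intermediate_width(24, 40) == 120  # no integer ratio: convert 24 -> 120 -> 40 in two stages
assert dwc_intermediate_width(32, 8) == 32  # integer ratio: a single conversion stage suffices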
- -import numpy as np -import os -import shutil -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None - - -class StreamingDataWidthConverter_rtl(HLSCustomOp): - """Class that corresponds to finn-rtllib datawidth converter - module.""" - - def get_nodeattr_types(self): - my_attrs = { - # shape of input/output tensors - "shape": ("ints", True, []), - # bit width of input and output streams - "inWidth": ("i", True, 0), - "outWidth": ("i", True, 0), - # FINN DataTypes for inputs/outputs - "dataType": ("s", True, ""), - # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, ""), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("dataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("dataType")] - - def get_normal_input_shape(self, ind=0): - ishape = self.get_nodeattr("shape") - return ishape - - def get_normal_output_shape(self, ind=0): - oshape = self.get_nodeattr("shape") - return oshape - - def check_divisible_iowidths(self): - iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - # the rtl module only supports - # stream widths that are divisible by - # integer width ratios - iwidth_d = iwidth % owidth == 0 - owidth_d = owidth % iwidth == 0 - assert ( - iwidth_d or owidth_d - ), """RTL implementation of DWC requires - stream widths that are integer width ratios - from each other. 
Input width is set to %s - and output width is set to %s """ % ( - iwidth, - owidth, - ) - - def get_folded_input_shape(self, ind=0): - self.check_divisible_iowidths() - iwidth = self.get_nodeattr("inWidth") - ishape = self.get_normal_input_shape() - dummy_t = np.random.randn(*ishape) - ibits = self.get_input_datatype().bitwidth() - assert ( - iwidth % ibits == 0 - ), """DWC input width must be divisible by - input element bitwidth""" - ielems = int(iwidth // ibits) - ichannels = ishape[-1] - new_shape = [] - for i in ishape[:-1]: - new_shape.append(i) - new_shape.append(int(ichannels // ielems)) - new_shape.append(ielems) - dummy_t = dummy_t.reshape(new_shape) - return dummy_t.shape - - def get_folded_output_shape(self, ind=0): - self.check_divisible_iowidths() - owidth = self.get_nodeattr("outWidth") - oshape = self.get_normal_output_shape() - dummy_t = np.random.randn(*oshape) - obits = self.get_output_datatype().bitwidth() - assert ( - owidth % obits == 0 - ), """DWC output width must be divisible by - input element bitwidth""" - oelems = int(owidth // obits) - ochannels = oshape[-1] - new_shape = [] - for i in oshape[:-1]: - new_shape.append(i) - new_shape.append(int(ochannels // oelems)) - new_shape.append(oelems) - dummy_t = dummy_t.reshape(new_shape) - - return dummy_t.shape - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def get_instream_width(self, ind=0): - in_width = self.get_nodeattr("inWidth") - return in_width - - def get_outstream_width(self, ind=0): - out_width = self.get_nodeattr("outWidth") - return out_width - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == tuple(exp_ishape), "Unexpect input shape for StreamingDWC." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("dataType", idt.name) - # data type stays the same - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - pass - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - raise Exception( - """cppsim not possible for StreamingDataWidthConverter_rtl, - please set exec_mode to rtlsim""" - ) - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == tuple( - exp_ishape - ), """Input shape doesn't - match expected shape.""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - - assert context[node.output[0]].shape == tuple( - exp_oshape - ), """Output shape doesn't match expected shape.""" - - def get_template_values(self): - topname = self.get_verilog_top_module_name() - ibits = self.get_instream_width() - obits = self.get_outstream_width() - code_gen_dict = { - "IBITS": int(ibits), - "OBITS": int(obits), - "TOP_MODULE_NAME": topname, - } - return code_gen_dict - - def generate_hdl(self): - rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/dwc/hdl" - template_path = rtlsrc + "/dwc_template.v" - code_gen_dict = self.get_template_values() - # save top module name so we can refer to it after this node has been renamed - # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) - self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) - - # apply code generation to templates - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - with open(template_path, "r") as f: - template = f.read() - for key_name in code_gen_dict: - key = "$%s$" % key_name - template = template.replace(key, str(code_gen_dict[key_name])) - - with open( - os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"), - "w", - ) as f: - f.write(template) - - sv_files = ["dwc_axi.sv", "dwc.sv"] - for sv_file in sv_files: - shutil.copy(rtlsrc + "/" + sv_file, code_gen_dir) - # set ipgen_path and ip_path so that HLS-Synth transformation - # and stich_ip transformation do not complain - self.set_nodeattr("ipgen_path", code_gen_dir) - self.set_nodeattr("ip_path", code_gen_dir) - - def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - # Modified to use generated (System-)Verilog instead of HLS output products - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_paths = [code_gen_dir] - verilog_files = [ - "dwc_axi.sv", - "dwc.sv", - self.get_nodeattr("gen_top_module") + ".v", - ] - - # build the Verilator emu library - sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim - - def code_generation_ipi(self): - """Constructs and returns the TCL for node instantiation in Vivado IPI.""" - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - - sourcefiles = [ - "dwc_axi.sv", - "dwc.sv", - self.get_nodeattr("gen_top_module") + ".v", - ] - - sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles] - - cmd = [] - for f in sourcefiles: - cmd += ["add_files -norecurse %s" % (f)] - cmd += [ - "create_bd_cell -type module -reference %s %s" - % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) - ] - return cmd - - def code_generation_ipgen(self, model, fpgapart, clk): - """Normally: Generates C++ code and tcl script for IP generation. - Here: Generates (System-)Verilog code for IP generation.""" - self.generate_hdl() - - def ipgen_singlenode_code(self): - """Normally: Builds the bash script for IP generation.""" - pass - - def code_generation_cppsim(self, model): - """Normally: Generates C++ code for simulation (cppsim).""" - pass - - def compile_singlenode_code(self): - pass - - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - - def pragmas(self): - pass diff --git a/src/finn/custom_op/fpgadataflow/streamingeltwise.py b/src/finn/custom_op/fpgadataflow/streamingeltwise.py new file mode 100644 index 0000000000..4681c144f7 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/streamingeltwise.py @@ -0,0 +1,216 @@ +# Copyright (c) 2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. 
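An aside on the `$KEY$` scheme shared by `generate_hdl()` above and the other code generators in this patch: template rendering is plain string substitution. A minimal, self-contained sketch under that assumption (function name hypothetical, not part of the patch):

def render_template(template: str, code_gen_dict: dict) -> str:
    # every $KEY$ placeholder is replaced by the stringified dictionary entry
    for key_name, value in code_gen_dict.items():
        template = template.replace("$%s$" % key_name, str(value))
    return template

assert render_template("ap_uint<$IBITS$> d;", {"IBITS": 24}) == "ap_uint<24> d;"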
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class StreamingEltwise(HWCustomOp): + """Abstraction layer for HW implementation of StreamingEltwise""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType0": ("s", True, ""), + "inputDataType1": ("s", True, ""), + # type of EltwiseFunction for the operation + "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) + return my_attrs + + def get_eltwise_op_lambda(self): + eltwise_op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + odt = self.get_output_datatype() + tin0 = idt0.get_hls_datatype_str() + tin1 = idt1.get_hls_datatype_str() + tout = odt.get_hls_datatype_str() + eltwise_ops = { + # "Add": "[](auto a, auto b) { return a + b; }", + # "Sub": "[](auto a, auto b) { return a - b; }", + # "AbsDiff": "[](auto a, auto b) { return a>b? 
a-b : b-a; }", + "Add": f"add<{tin0}, {tin1}, {tout}>()", + "Sub": f"sub<{tin0}, {tin1}, {tout}>()", + "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()", + } + return eltwise_ops[eltwise_op] + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich]) + return ishape + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + assert ich % pe == 0, "PE must divide NumChannels" + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich // pe, pe]) + return ishape + + def get_normal_output_shape(self, ind=0): + return self.get_normal_input_shape() + + def get_folded_output_shape(self, ind=0): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input1 shape." + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert ishape == exp_ishape, "Unexpected input2 shape." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt0 = model.get_tensor_datatype(node.input[0]) + if idt0 != self.get_input_datatype(0): + warn_str = "inputDataType0 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(0)), + str(idt0), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType0", idt0.name) + idt1 = model.get_tensor_datatype(node.input[1]) + if idt1 != self.get_input_datatype(1): + warn_str = "inputDataType1 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(1)), + str(idt1), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType1", idt1.name) + # enforce output data type (calculated based on idt) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType" + str(ind))] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + assert idt0.signed() == idt1.signed(), ( + "%s: Inputs must have same signedness" % self.onnx_node.name + ) + idt0_min, idt0_max = idt0.min(), idt0.max() + idt1_min, idt1_max = idt1.min(), idt1.max() + cands = [ + idt0_min - idt1_min, + idt0_min - idt1_max, + idt0_max - idt1_min, + idt0_max - idt1_max, + ] + largest_magnitude = max(map(abs, cands)) + if op == "Add": + if idt0.signed(): + return DataType.get_smallest_possible(idt0.min() + idt1.min()) + else: + return DataType.get_smallest_possible(idt0.max() + idt1.max()) + elif op == "Sub": + return DataType.get_smallest_possible(-largest_magnitude) + elif op == "AbsDiff": + return DataType.get_smallest_possible(largest_magnitude) + else: + raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op)) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype(ind).bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = 
self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + # simulate behavior using Python + node = self.onnx_node + inp0_values = context[node.input[0]] + inp1_values = context[node.input[1]] + eltwiseOp = self.get_nodeattr("eltwiseOp") + oshape = context[node.output[0]].shape + ishape0 = inp0_values.shape + ishape1 = inp1_values.shape + assert ishape0 == ishape1, "Shapes of inputs should be the same for Streamingeltwise" + # subtraction + result = inp0_values - inp1_values + if eltwiseOp == "Sub": + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) + elif eltwiseOp == "AbsDiff": + context[node.output[0]] = np.abs(np.asarray(result, dtype=np.float32)).reshape(oshape) + else: + raise Exception("%s: Unknown eltWiseOp = %s" % (node.name, eltwiseOp)) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + sname = self.hls_sname() + swidth = self.get_instream_width_padded() + intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index 1249bc1251..1556575b00 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,23 +27,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import math import numpy as np -import os -import subprocess import warnings from qonnx.core.datatype import DataType -from shutil import copy -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import get_finn_root -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -from . 
import templates - -class StreamingFIFO(HLSCustomOp): +class StreamingFIFO(HWCustomOp): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - self.strm_fifo_wrapper = templates.strm_fifo_wrapper def get_nodeattr_types(self): my_attrs = super().get_nodeattr_types() @@ -53,12 +45,10 @@ def get_nodeattr_types(self): "depth": ("i", True, 0), # folded shape of input/output "folded_shape": ("ints", True, []), + # normal shape of input/output + "normal_shape": ("ints", True, []), # FINN DataTypes for inputs/outputs "dataType": ("s", True, ""), - # Toggle between hls or IPI implementation - # rtl - use the hls generated IP during stitching - # vivado - use the AXI Infrastructure FIFO - "impl_style": ("s", False, "rtl", {"rtl", "vivado"}), # FPGA resource type for FIFOs when impl_style is vivado # auto -- let Vivado decide # block -- use BRAM @@ -80,22 +70,6 @@ def get_nodeattr_types(self): return my_attrs - def get_adjusted_depth(self): - impl = self.get_nodeattr("impl_style") - depth = self.get_nodeattr("depth") - if impl == "vivado": - old_depth = depth - # round up depth to nearest power-of-2 - # Vivado FIFO impl may fail otherwise - depth = (1 << (depth - 1).bit_length()) if impl == "vivado" else depth - if old_depth != depth: - warnings.warn( - "%s: rounding-up FIFO depth from %d to %d for impl_style=vivado" - % (self.onnx_node.name, old_depth, depth) - ) - - return depth - def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() oshape = self.get_normal_output_shape() @@ -128,111 +102,12 @@ def get_verilog_top_module_intf_names(self): ret["ap_none"] = ["maxcount"] return ret - def get_verilog_top_module_name(self): - "Return the Verilog top module name for this node." - - node = self.onnx_node - prefixed_top_name = "%s" % (node.name) - return prefixed_top_name - - def code_generation_ipgen(self, model, fpgapart, clk): - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_dir = "{}/project_{}/sol1/impl/verilog".format(code_gen_dir, self.onnx_node.name) - os.makedirs(verilog_dir) - # copy Q_srl.v from finn-rtllib to verilog directory - memstream_dir = get_finn_root() + "/finn-rtllib/memstream/hdl/" - Q_file = os.path.join(memstream_dir, "Q_srl.v") - copy(Q_file, verilog_dir) - - # empty code gen dictionary for new entries - self.code_gen_dict.clear() - self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)] - self.code_gen_dict["$LAYER_NAME$"] = [ - "{}_{}".format(self.onnx_node.name, self.onnx_node.name) - ] - # make instream width a multiple of 8 for axi interface - in_width = self.get_instream_width_padded() - count_width = int(self.get_nodeattr("depth") - 1).bit_length() - self.code_gen_dict["$COUNT_RANGE$"] = ["[{}:0]".format(count_width - 1)] - self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)] - self.code_gen_dict["$OUT_RANGE$"] = ["[{}:0]".format(in_width - 1)] - self.code_gen_dict["$WIDTH$"] = [str(in_width)] - self.code_gen_dict["$DEPTH$"] = [str(self.get_nodeattr("depth"))] - self.code_gen_dict["$HLS_SNAME$"] = [self.hls_sname()] - - template = self.strm_fifo_wrapper - - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - f = open(os.path.join(verilog_dir, "{}.v".format(self.onnx_node.name)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - def ipgen_singlenode_code(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - 
verilog_dir = "{}/project_{}/sol1/impl/verilog".format(code_gen_dir, self.onnx_node.name) - # prepare the IP packaging tcl template - template = templates.ip_package_tcl - self.code_gen_dict.clear() - self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)] - # note: setting the root dir as absolute can cause path problems - # the ipgen script will be invoked from the sources dir so root_dir=. is OK - self.code_gen_dict["$VERILOG_DIR$"] = ["."] - self.code_gen_dict["$HLS_SNAME$"] = [self.hls_sname()] - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - f = open(os.path.join(verilog_dir, "package_ip.tcl"), "w") - f.write(template) - f.close() - # create a shell script and call Vivado to invoke the IP pkg script - make_project_sh = verilog_dir + "/make_ip.sh" - working_dir = os.environ["PWD"] - with open(make_project_sh, "w") as f: - f.write("#!/bin/bash \n") - f.write("cd {}\n".format(verilog_dir)) - f.write("vivado -mode batch -source package_ip.tcl\n") - f.write("cd {}\n".format(working_dir)) - bash_command = ["bash", make_project_sh] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate() - # set ipgen_path and ip_path to point to the new packaged IP - self.set_nodeattr("ipgen_path", verilog_dir) - self.set_nodeattr("ip_path", verilog_dir) - vlnv = "xilinx.com:hls:%s:1.0" % (self.onnx_node.name) - self.set_nodeattr("ip_vlnv", vlnv) - self.code_gen_dict.clear() - def get_normal_input_shape(self, ind=0): depth = self.get_adjusted_depth() - assert depth >= 2, """Depth is too low""" + assert depth >= 1, """Depth is too low""" if depth > 256 and self.get_nodeattr("impl_style") == "rtl": warnings.warn("Depth is high, set between 2 and 256 for efficient SRL implementation") - # derive normal shape from folded shape - # StreamingFIFOs are inserted in between fpgadataflow nodes - # the folded shape could be for example (1, nf, pe) - # with nf (neuron folding): mh // pe - # the normal input shape is in this case (1, mh) - # so to achieve this the two inner dimensions are multiplied - # and together with all previous dimensions - # this gives the normal input shape - - folded_shape = self.get_nodeattr("folded_shape") - # extract inner dimension - inner_dim = folded_shape[-1] - # multiply with the next inner dimension - folding_factor = folded_shape[-2] * inner_dim - normal_ishape = [] - # create the normal_ishape - for i in range(len(folded_shape) - 2): - normal_ishape.append(folded_shape[i]) - normal_ishape.append(folding_factor) - - return normal_ishape + return self.get_nodeattr("normal_shape") def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() @@ -262,154 +137,13 @@ def get_output_datatype(self, ind=0): return DataType[self.get_nodeattr("dataType")] def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") node = self.onnx_node - inp = context[node.input[0]] - exp_shape = self.get_normal_input_shape() - - if mode == "cppsim": - output = inp - output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) - context[node.output[0]] = output - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - # create a npy file for the input of the node - assert ( - str(inp.dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = 
inp.reshape(expected_inp_shape) - if DataType[self.get_nodeattr("dataType")] == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = DataType[self.get_nodeattr("dataType")] - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - output = self.rtlsim(sim, inp) - odt = DataType[self.get_nodeattr("dataType")] - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) + context[node.output[0]] = context[node.input[0]] def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - - def pragmas(self): - pass - - def code_generation_ipi(self): - impl_style = self.get_nodeattr("impl_style") - if impl_style == "rtl": - return super().code_generation_ipi() - elif impl_style == "vivado": - cmd = [] - node_name = self.onnx_node.name - depth = self.get_adjusted_depth() - ram_style = self.get_nodeattr("ram_style") - # create a hierarchy for this layer, with the same port names - clk_name = self.get_verilog_top_module_intf_names()["clk"][0] - rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] - din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] - cmd.append("create_bd_cell -type hier %s" % node_name) - cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) - cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) - cmd.append( - "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) - ) - cmd.append( - "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) - ) - # instantiate and configure DWC - cmd.append( - "create_bd_cell -type ip " - "-vlnv xilinx.com:ip:axis_data_fifo:2.0 /%s/fifo" % node_name - ) - cmd.append( - "set_property -dict [list CONFIG.FIFO_DEPTH {%d}] " - "[get_bd_cells /%s/fifo]" % (depth, node_name) - ) - cmd.append( - "set_property -dict [list CONFIG.FIFO_MEMORY_TYPE {%s}] " - "[get_bd_cells /%s/fifo]" % (ram_style, node_name) - ) - cmd.append( - "set_property -dict [list CONFIG.TDATA_NUM_BYTES {%d}] " - "[get_bd_cells /%s/fifo]" % (np.ceil(self.get_outstream_width() / 8), node_name) - ) - cmd.append( - "connect_bd_intf_net 
[get_bd_intf_pins %s/fifo/M_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/fifo/S_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] " - "[get_bd_pins %s/fifo/s_axis_aresetn]" % (node_name, rst_name, node_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] " - "[get_bd_pins %s/fifo/s_axis_aclk]" % (node_name, clk_name, node_name) - ) - return cmd - else: - raise Exception( - "FIFO implementation style %s not supported, please use rtl or vivado" % impl_style - ) - def bram_estimation(self): """Calculates resource estimation for BRAM""" impl = self.get_nodeattr("impl_style") @@ -473,10 +207,3 @@ def lut_estimation(self): ram_luts = 0 return int(address_luts + ram_luts) - - def prepare_rtlsim(self): - assert self.get_nodeattr("impl_style") != "vivado", ( - "StreamingFIFO impl_style " - "cannot be vivado for rtlsim. Only impl_style=rtl supported." - ) - super().prepare_rtlsim() diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py new file mode 100755 index 0000000000..59a8f092d0 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py @@ -0,0 +1,236 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
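For reference while reading the shape methods in the new StreamingMaxPool below: `compute_pool_output_dim` is assumed here to follow the standard ONNX pooling output-size formula, with `CeilMode` selecting the rounding direction. A small sketch under that assumption (helper name hypothetical, dilation fixed at 1):

import math

def pool_out_dim(ifm_dim: int, k: int, stride: int, pad: int, ceil_mode: int) -> int:
    # standard ONNX MaxPool output-size formula
    rounding = math.ceil if ceil_mode else math.floor
    return int(rounding((ifm_dim + 2 * pad - k) / stride)) + 1

assert pool_out_dim(4, 2, 2, 0, 0) == 2  # 4x4 image, 2x2 pool -> 2x2 output
assert pool_out_dim(5, 2, 2, 0, 1) == 3  # CeilMode=1 keeps the ragged edge
assert pool_out_dim(5, 2, 2, 0, 0) == 2  # CeilMode=0 drops it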
+ +import numpy as np +import onnxruntime as rt +import warnings +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim +from qonnx.util.basic import qonnx_make_model + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# TODO: consider splitting this into separate implementations for 1D and 2D +# similar to what we do for ConvolutionInputGenerator + + +class StreamingMaxPool(HWCustomOp): + """Abstraction layer for HW implementation of StreamingMaxPool""" + + def get_nodeattr_types(self): + my_attrs = { + "ImgDim": ("ints", True, []), # [H, W] = [Y, X] + "PoolDim": ("ints", True, []), # [H, W] = [Y, X] + "NumChannels": ("i", True, 0), + # parallelism control - only supported for 1D maxpool + "PE": ("i", False, 0), + # round up (instead of down) output size - only supported for 1D maxpool + "CeilMode": ("i", False, 0), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("dataType")] + + def get_1d_attrs_normalized(self): + # support both (1, D) and (D, 1) cases transparently: + # assume the dummy ('1') dimension is the Y-dimension, i.e. + # images and kernels (and their attributes) of dimension + # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D] + ifm_dim = self.get_nodeattr("ImgDim") + k = self.get_nodeattr("PoolDim") + ifm_ch = self.get_nodeattr("NumChannels") + if ifm_dim[1] == 1: + ifm_dim = ifm_dim[::-1] + k = k[::-1] + return (ifm_dim, k, ifm_ch) + + def is_1d(self): + ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() + return (ifm_dim[0] == 1) and (k[0] == 1) + + def get_normal_input_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") + ifm_ch = self.get_nodeattr("NumChannels") + ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) + return ishape + + def get_folded_input_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") + ifm_ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + nf = int(ifm_ch / pe) + if self.is_1d(): + folded_ishape = (1, ifm_dim_h, ifm_dim_w, nf, pe) + else: + folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") + k_h, k_w = tuple(self.get_nodeattr("PoolDim")) + ifm_ch = self.get_nodeattr("NumChannels") + ceil_mode = self.get_nodeattr("CeilMode") + if not self.is_1d(): + assert ifm_dim_h % k_h == 0, "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0" + assert ifm_dim_w % k_w == 0, "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0" + ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, k_h, 0, ceil_mode) + ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, k_w, 0, ceil_mode) + oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch) + return oshape + + def get_folded_output_shape(self, ind=0): + # even though there is no folding in the current hlslib op, + # insert a time multiplexing axis to remain compatible with the + # shapes produced by the rest of the dataflow pipeline + ifm_ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + nf = int(ifm_ch / pe) + ret = list(self.get_normal_output_shape()) + if self.is_1d(): + ret[-1] = nf + 
ret.append(pe) + else: + ret.insert(-1, 1) + return tuple(ret) + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def get_exp_cycles(self): + # derived from StreamingMaxPool_Batch loop nest + ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() + + warnings.warn( + """Estimated latency for layer {} can be lower than + actual latency!""".format( + self.onnx_node.name + ) + ) + if self.is_1d(): + _, _, _, nf, _ = self.get_folded_output_shape() + ceil_mode = self.get_nodeattr("CeilMode") + ofm_dim = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) + exp_cycles = ofm_dim * nf * (k[1] + 1) + return int(exp_cycles) + else: + # TODO: adjust inaccurate formula + return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1]))) + + def get_instream_width(self, ind=0): + dt_bits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + ifm_ch = self.get_nodeattr("NumChannels") + if self.is_1d(): + in_width = int(dt_bits * pe) + else: + in_width = int(dt_bits * ifm_ch) + return in_width + + def get_outstream_width(self, ind=0): + """For streaming maxpool out stream width is the same as in stream width""" + return self.get_instream_width() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape for StreamingMaxPool." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("dataType", idt.name) + # data type stays the same + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + pass + + def execute_node(self, context, graph): + # create a standard MaxPool ONNX node to help calculate the result + node = self.onnx_node + kernel_shape = self.get_nodeattr("PoolDim") + ceil_mode = self.get_nodeattr("CeilMode") + inp_values = context[node.input[0]] + dummy_out = context[node.output[0]] + # convert i/o NHWC -> NCHW + inp_values = np.transpose(inp_values, (0, 3, 1, 2)) + dummy_out = np.transpose(dummy_out, (0, 3, 1, 2)) + # handle 1d case + ishape = inp_values.shape + if ishape[2] == 1 or ishape[3] == 1: + inp_values = inp_values.reshape(ishape[0], ishape[1], ishape[2] * ishape[3]) + kernel_shape = [kernel_shape[0] * kernel_shape[1]] + # execute as regular MaxPool + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, inp_values.shape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, dummy_out.shape) + node_mp = helper.make_node( + "MaxPool", + inputs=[node.input[0]], + outputs=[node.output[0]], + kernel_shape=kernel_shape, + strides=kernel_shape, + ceil_mode=ceil_mode, + ) + graph_mp = helper.make_graph( + nodes=[node_mp], + name="single-mp-exec", + inputs=[inp], + outputs=[outp], + ) + + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_mp = qonnx_make_model(graph_mp, **onnx_kwargs) + idict = {node.input[0]: inp_values} + sess = rt.InferenceSession(model_mp.SerializeToString()) + result = sess.run(None, idict) + result = np.asarray(result, 
dtype=np.float32).reshape(dummy_out.shape) + # convert output NCHW -> NHWC + result = np.transpose(result, (0, 2, 3, 1)) + context[node.output[0]] = result diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py deleted file mode 100755 index 8f294da4ac..0000000000 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py +++ /dev/null @@ -1,441 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType -from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - -# TODO: consider splitting this into separate implementations for 1D and 2D -# similar to what we do for ConvolutionInputGenerator - - -class StreamingMaxPool_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib StreamingMaxPool_batch function.""" - - def get_nodeattr_types(self): - my_attrs = { - "ImgDim": ("ints", True, []), # [H, W] = [Y, X] - "PoolDim": ("ints", True, []), # [H, W] = [Y, X] - "NumChannels": ("i", True, 0), - # parallelism control - only supported for 1D maxpool - "PE": ("i", False, 0), - # round up (instead of down) output size - only supported for 1D maxpool - "CeilMode": ("i", False, 0), - # FINN DataTypes for inputs/outputs - "dataType": ("s", True, ""), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("dataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("dataType")] - - def get_1d_attrs_normalized(self): - # support both (1, D) and (D, 1) cases transparently: - # assume the dummy ('1') dimension is the Y-dimension, i.e. 
- # images and kernels (and their attributes) of dimension - # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D] - ifm_dim = self.get_nodeattr("ImgDim") - k = self.get_nodeattr("PoolDim") - ifm_ch = self.get_nodeattr("NumChannels") - if ifm_dim[1] == 1: - ifm_dim = ifm_dim[::-1] - k = k[::-1] - return (ifm_dim, k, ifm_ch) - - def is_1d(self): - ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() - return (ifm_dim[0] == 1) and (k[0] == 1) - - def get_normal_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") - ifm_ch = self.get_nodeattr("NumChannels") - ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") - ifm_ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - nf = int(ifm_ch / pe) - if self.is_1d(): - folded_ishape = (1, ifm_dim_h, ifm_dim_w, nf, pe) - else: - folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") - k_h, k_w = tuple(self.get_nodeattr("PoolDim")) - ifm_ch = self.get_nodeattr("NumChannels") - ceil_mode = self.get_nodeattr("CeilMode") - if not self.is_1d(): - assert ifm_dim_h % k_h == 0, "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0" - assert ifm_dim_w % k_w == 0, "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0" - ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, k_h, 0, ceil_mode) - ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, k_w, 0, ceil_mode) - oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch) - return oshape - - def get_folded_output_shape(self, ind=0): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ifm_ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - nf = int(ifm_ch / pe) - ret = list(self.get_normal_output_shape()) - if self.is_1d(): - ret[-1] = nf - ret.append(pe) - else: - ret.insert(-1, 1) - return tuple(ret) - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def get_exp_cycles(self): - # derived from StreamingMaxPool_Batch loop nest - ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() - - warnings.warn( - """Estimated latency for layer {} can be lower than - actual latency!""".format( - self.onnx_node.name - ) - ) - if self.is_1d(): - _, _, _, nf, _ = self.get_folded_output_shape() - ceil_mode = self.get_nodeattr("CeilMode") - ofm_dim = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) - exp_cycles = ofm_dim * nf * (k[1] + 1) - return int(exp_cycles) - else: - # TODO: adjust inaccurate formula - return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1]))) - - def get_instream_width(self, ind=0): - dt_bits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - ifm_ch = self.get_nodeattr("NumChannels") - if self.is_1d(): - in_width = int(dt_bits * pe) - else: - in_width = int(dt_bits * ifm_ch) - return in_width - - def get_outstream_width(self, ind=0): - """For streaming maxpool out stream width is the same as in stream width""" - return self.get_instream_width() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape 
for StreamingMaxPool." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("dataType", idt.name) - # data type stays the same - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify the number of inputs - if len(self.onnx_node.input) == 1: - info_messages.append("The number of inputs is correct") - else: - info_messages.append("""StreamingMaxPool_Batch needs 1 data input""") - - return info_messages - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] - - def defines(self, var): - numReps = 1 - ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() - ceil_mode = self.get_nodeattr("CeilMode") - output_size = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) - - if self.is_1d(): - self.code_gen_dict["$DEFINES$"] = [ - """#define ImgDim {}\n #define PoolDim {}\n - #define NumChannels {}\n #define PE {}\n #define OutputSize {} - \n #define numReps {}""".format( - ifm_dim[1], - k[1], - self.get_nodeattr("NumChannels"), - self.get_nodeattr("PE"), - output_size, - numReps, - ) - ] - else: - self.code_gen_dict["$DEFINES$"] = [ - """#define ImgDim {}\n #define PoolDim {}\n - #define NumChannels {}\n #define numReps {}""".format( - ifm_dim[1], - k[1], - self.get_nodeattr("NumChannels"), - numReps, - ) - ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - dtype = self.get_input_datatype() - if dtype.bitwidth() == 1: - if self.is_1d(): - raise Exception("Binary 1d MaxPool not implemented on HLS backend") - else: - op = "StreamingMaxPool" - self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0_%s, out_%s);" - % (op, self.hls_sname(), self.hls_sname()) - ] - else: - dtype = self.get_input_datatype() - dtype_hls = dtype.get_hls_datatype_str() - minval_str = str(int(dtype.min())) - if self.is_1d(): - op = "StreamingMaxPool_Precision_1d" - 
self.code_gen_dict["$DOCOMPUTE$"] = [ - """%s(in0_%s, out_%s);""" - % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname()) - ] - else: - op = "StreamingMaxPool_Precision" - self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0_%s, out_%s);" - % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname()) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - packed_hls_type, - self.hls_sname(), - packed_hls_type, - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim \ - did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output - shape doesn't match expected shape (1, ofm_dim, ofm_dim, ifm_ch).""" diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 4e03e6daf9..3d89a0ab23 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -212,49 +212,3 @@ ipx::save_core [ipx::current_core] ipx::archive_core $Top.zip [ipx::current_core] """ - -strm_fifo_wrapper = """ -module $TOPNAME$( -ap_clk, -ap_rst_n, -count, -maxcount, -in0_$HLS_SNAME$_TDATA, -in0_$HLS_SNAME$_TVALID, -in0_$HLS_SNAME$_TREADY, -out_$HLS_SNAME$_TDATA, -out_$HLS_SNAME$_TVALID, -out_$HLS_SNAME$_TREADY -); - -input ap_clk; -input ap_rst_n; -output $COUNT_RANGE$ count; -output $COUNT_RANGE$ maxcount; -input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; -input in0_$HLS_SNAME$_TVALID; -output in0_$HLS_SNAME$_TREADY; -output $OUT_RANGE$ out_$HLS_SNAME$_TDATA; -output out_$HLS_SNAME$_TVALID; -input out_$HLS_SNAME$_TREADY; - -Q_srl #( -.depth($DEPTH$), -.width($WIDTH$) -) -$LAYER_NAME$ -( - .clock(ap_clk), - .reset(!ap_rst_n), - .count(count), - .maxcount(maxcount), - .i_d(in0_$HLS_SNAME$_TDATA), - .i_v(in0_$HLS_SNAME$_TVALID), - .i_r(in0_$HLS_SNAME$_TREADY), - .o_d(out_$HLS_SNAME$_TDATA), - .o_v(out_$HLS_SNAME$_TVALID), - .o_r(out_$HLS_SNAME$_TREADY) -); - -endmodule -""" diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py new 
file mode 100644 index 0000000000..dde813a293 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -0,0 +1,268 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class Thresholding(HWCustomOp): + """Abstraction layer for HW implementation of Thresholding.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # whether weights (thresholds) will be + # writable through an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # parallelization; channels thresholded per cycle + "PE": ("i", True, 0), + # number of channels (each may have different thresholds) + "NumChannels": ("i", True, 0), + # number of steps in thresholding function. 
Used only in decoupled mode + "numSteps": ("i", True, 1), + # FINN DataTypes for inputs, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + # initialization value for the thresholding accumulator + "ActVal": ("i", False, 0), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype().name), + str(idt.name), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + # TODO collect automatically from get_nodeattr_types + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + self.get_nodeattr("outputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required Threshold_Batch attributes do not exist.""") + + return info_messages + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_weight_datatype(self): + """Returns FINN DataType of thresholds, here called weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def get_weightstream_width(self): + """Returns weight stream width""" + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + n_thres_steps = self.get_nodeattr("numSteps") + w_width = pe * wp * n_thres_steps + return w_width + + def minimize_accumulator_width(self, model): + "Minimize threshold width ('accumulator width' here due to convention)" + thresholds = model.get_initializer(self.onnx_node.input[1]) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + min_input = self.get_input_datatype().min() + max_input = self.get_input_datatype().max() + # get range required by threshold values + tdt_min = min(min_input, min_threshold) + tdt_max = max(max_input, max_threshold) + if tdt_min < 0: + if abs(tdt_min) > tdt_max: + tdt = DataType.get_smallest_possible(tdt_min) + else: + tdt = DataType.get_smallest_possible(-tdt_max - 1) + else: + tdt = DataType.get_smallest_possible(tdt_max) + assert 
np.vectorize(tdt.allowed)(
+            threshold_tensor
+        ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
+        self.set_nodeattr("weightDataType", tdt.name)
+        # Update QONNX DataType of tensor for consistency
+        model.set_tensor_datatype(self.onnx_node.input[1], tdt)
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_instream_width(self, ind=0):
+        i_bits = self.get_input_datatype().bitwidth()
+        return i_bits * self.get_nodeattr("PE")
+
+    def get_outstream_width(self, ind=0):
+        o_bits = self.get_output_datatype().bitwidth()
+        return o_bits * self.get_nodeattr("PE")
+
+    def get_folded_input_shape(self, ind=0):
+        pe = self.get_nodeattr("PE")
+        fold = self.calc_tmem()
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        folded_input_shape = tuple(vecs + [fold, pe])
+        return folded_input_shape
+
+    def get_folded_output_shape(self, ind=0):
+        # same shape as input
+        return self.get_folded_input_shape()
+
+    def get_normal_input_shape(self, ind=0):
+        ich = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        normal_input_shape = tuple(vecs + [ich])
+        return normal_input_shape
+
+    def get_normal_output_shape(self, ind=0):
+        # same shape as input
+        return self.get_normal_input_shape()
+
+    def get_number_output_values(self):
+        nf = np.prod(self.get_folded_output_shape()[:-1])
+        return nf
+
+    def get_exp_cycles(self):
+        # Channels/PE * batch size * fmdim * fmdim
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def get_hw_compatible_threshold_tensor(self, orig_thres_matrix):
+        """Convert the original numpy threshold matrix orig_thres_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0
+        * for unsigned inputs, ensure thresholds are positive
+        * interleave rows between PEs
+        * reshape into (PE, TMEM, n_thres_steps) and return
+        """
+        mh = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        tmem = mh // pe
+        assert mh % pe == 0, "Requirement NumChannels divisible by PE is violated."
+        assert (
+            orig_thres_matrix.ndim == 2
+        ), """Threshold matrix dimension is
+        not as expected (2)."""
+        n_thres_steps = orig_thres_matrix.shape[1]
+        assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps"
+        if not self.get_input_datatype().signed():
+            # ensure all thresholds are nonnegative
+            assert (orig_thres_matrix >= 0).all()
+        # ensure all thresholds are integer
+        assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor"
+        ret = orig_thres_matrix
+        # ensure channels = mh, duplicating if necessary
+        if ret.shape[0] == 1:
+            ret = np.tile(ret, (mh, 1))
+        assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)"
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        assert (
+            ret.shape[0] == pe
+        ), """First dimension after distribution of the
+        rows between PEs is not as expected (pe)"""
+        assert (
+            ret.shape[1] == tmem
+        ), """Second dimension after distribution of the
+        rows between PEs is not as expected (tmem)"""
+        assert (
+            ret.shape[2] == n_thres_steps
+        ), """Third dimension after distribution of the
+        rows between PEs is not as expected (n_thres_steps)"""
+        return ret.reshape(1, pe, tmem, n_thres_steps)
+
+    def execute_node(self, context, graph):
+        node = self.onnx_node
+        inp_values = context[node.input[0]]
+        th_val = context[node.input[1]]
+        # MT expects inputs to be in the shape (N,C,H,W) or (N, C)
+        # if 4D then input values in context are (N,H,W,C) and need to
+        # be transposed.
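
For intuition, `multithreshold` counts, per channel, how many of that channel's threshold steps each input value meets or exceeds. A minimal NumPy sketch of the same semantics for the 2D `(N, C)` case (illustrative only; the real implementation lives in `qonnx.custom_op.general.multithreshold`):

```python
import numpy as np

def multithreshold_2d(x, thresholds):
    # x: (N, C) activations; thresholds: (C, n_steps), each row ascending.
    # The output is the number of thresholds met or exceeded, i.e. the
    # quantized activation level in [0, n_steps].
    out = np.zeros_like(x)
    for ch in range(x.shape[1]):
        for t in thresholds[ch]:
            out[:, ch] += (x[:, ch] >= t)
    return out

x = np.array([[0.3, 1.7], [2.5, -0.2]])
thr = np.array([[0.5, 1.0, 2.0], [0.0, 1.0, 2.0]])
print(multithreshold_2d(x, thr))  # [[0. 2.] [3. 0.]]
```
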
+ # if 2D then inputs can be passed directly to MT function + is_4d = len(inp_values.shape) == 4 + if is_4d: + inp_values = np.transpose(inp_values, (0, 3, 1, 2)) + y = multithreshold(inp_values, th_val) + if is_4d: + y = y.transpose(0, 2, 3, 1) + act = DataType[self.get_nodeattr("outputDataType")] + if act == DataType["BIPOLAR"]: + # binary to bipolar + y = 2 * y - 1 + else: + # signed offset + y += act.min() + context[node.output[0]] = y + + def calc_tmem(self): + """Calculates and returns TMEM.""" + num_channels = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + return num_channels // pe diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py index 9c0db1f3df..3348394e05 100644 --- a/src/finn/custom_op/fpgadataflow/upsampler.py +++ b/src/finn/custom_op/fpgadataflow/upsampler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,19 +27,17 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np +import onnxruntime as rt import warnings +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class UpsampleNearestNeighbour_Batch(HLSCustomOp): - """ - Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function. - Upsampling is done with the Nearest Neighbour algorithm. - The layer expects square feature maps for the in and output. 
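
For reference, nearest-neighbour upsampling by an integer factor is equivalent to repeating each pixel along the spatial axes; a small NumPy sketch of the behaviour that the new `execute_node` below reproduces via an onnxruntime `Resize` node (illustrative, NHWC layout assumed):

```python
import numpy as np

def upsample_nn_nhwc(x, factor):
    # repeat each pixel `factor` times along H and W, which is exactly
    # nearest-neighbour upsampling for integer scale factors
    return np.repeat(np.repeat(x, factor, axis=1), factor, axis=2)

x = np.arange(4, dtype=np.float32).reshape(1, 2, 2, 1)
print(upsample_nn_nhwc(x, 2).shape)  # (1, 4, 4, 1)
```
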
- """ +class UpsampleNearestNeighbour(HWCustomOp): + """Abstraction layer for HW implementation of UpsampleNearestNeighbour.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -150,202 +148,44 @@ def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"'] - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - ifm_ch = self.get_nodeattr("NumChannels") - self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] - - ibits = self.get_input_datatype().bitwidth() - self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] - - idim = self.get_nodeattr("IFMDim") - self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] - - odim = self.get_nodeattr("OFMDim") - self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] - - batch_size = self.get_nodeattr("numInputVectors") - self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - is_2d = self.get_nodeattr("DimMode") == 0 - batch = self.get_nodeattr("numInputVectors") - if is_2d: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """UpsampleNearestNeighbour_Batch > (in0_%s, out_%s, numReps);""" - % (self.hls_sname(), self.hls_sname()) - ] - else: - assert batch == 1, "1D upsampler currently needs numReps=1" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """UpsampleNearestNeighbour_1D > (in0_%s, out_%s);""" - % (self.hls_sname(), self.hls_sname()) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - 
self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - packed_hls_type, - self.hls_sname(), - packed_hls_type, - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # create a standard resize node to help calculate the result node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_oshape = self.get_folded_output_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + inp_values = context[node.input[0]] + ishape = inp_values.shape + odim = self.get_nodeattr("OFMDim") + idim = self.get_nodeattr("IFMDim") + if ishape[1] == ishape[2]: + scales_val = [1, int(round(odim / idim)), int(round(odim / idim)), 1] + elif ishape[1] > 1 and ishape[2] == 1: + scales_val = [1, int(round(odim / idim)), 1, 1] else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) + warnings.warn( + """HW abstraction layer for Upsample cannot be executed. + Upsampling only supported for 1D H, or 2D square scaling""" ) + oshape = context[node.output[0]].shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + scales = helper.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + node_resize = helper.make_node( + "Resize", + inputs=[node.input[0], "", "scales"], + outputs=[node.output[0]], + mode="nearest", + ) + graph_resize = helper.make_graph( + nodes=[node_resize], + name="single-resize-exec", + inputs=[inp, scales], + outputs=[outp], + ) - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" - export_idt = self.get_input_datatype() - self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = 
self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim, OutputDim, NumChannels).""" + opset_version = 13 + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_resize = qonnx_make_model(graph_resize, **onnx_kwargs) + idict = {node.input[0]: inp_values, "scales": scales_val} + sess = rt.InferenceSession(model_resize.SerializeToString()) + result = sess.run(None, idict) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index bd5bb75f1d..ef80b24a2e 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,27 +28,23 @@ import math import numpy as np -import os +import onnx.numpy_helper as np_helper import textwrap import warnings from qonnx.core.datatype import DataType +from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string -class VectorVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" +class VVAU(HWCustomOp): + """Abstraction layer for HW implementation of VectorVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -71,11 +67,16 @@ def get_nodeattr_types(self): # no-activation mode (produce accumulators) "noActivation": ("i", False, 0, {0, 1}), # memory mode for the layer weights - # const -- embedded weights, default, long compile/synth times - # decoupled -- streaming weights with weight streamer packaged inside IP + # internal_embedded -- embedded weights, long compile/synth times + # internal_decoupled -- default, streaming weights with streamer packaged inside IP # external -- streaming weights with external streamer - "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), - # (mem_mode = decoupled only) whether weights will be writable through + "mem_mode": ( + "s", + False, + "internal_decoupled", + {"internal_embedded", "internal_decoupled", "external"}, + ), + # (mem_mode = internal_decoupled only) whether weights will be writable through # an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. 
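
As a usage sketch, the renamed memory modes are ordinary qonnx node attributes and can be switched per layer through the CustomOp wrapper (the model file and node name here are hypothetical):

```python
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.custom_op.registry import getCustomOp

# switch a layer to streaming weights with runtime-writable weights
model = ModelWrapper("model.onnx")
inst = getCustomOp(model.get_node_from_name("VVAU_0"))
inst.set_nodeattr("mem_mode", "internal_decoupled")
inst.set_nodeattr("runtime_writeable_weights", 1)
model.save("model_decoupled.onnx")
```
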
# see finn-rtllib/memstream/doc/README for more about the memory @@ -85,7 +86,7 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. "runtime_writeable_weights": ("i", False, 0, {0, 1}), - # FPGA resource type for memories in decoupled mode + # FPGA resource type for memories in internal_decoupled mode # auto -- let Vivado decide # block -- use BRAM # distributed -- use LUTRAM @@ -104,123 +105,66 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def minimize_accumulator_width(self, model): - """Minimize the accumulator bit width according to the weight values, - input data types, and size of dot product""" - weights = model.get_initializer(self.onnx_node.input[1]) - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - # put weights into the shape expected by calculate_matvec_accumulator_range - weights = weights.reshape(fm, k_h * k_w).transpose() - # since in the calculation the values of the weight matrix are used, - # for the bipolar case they need to be converted to bipolar - if self.get_nodeattr("binaryXnorMode"): - weights = 2 * weights - 1 - if len(self.onnx_node.input) > 2: - thresholds = model.get_initializer(self.onnx_node.input[2]) - else: - thresholds = None - idt = self.get_input_datatype() - - (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) - # if runtime-writeable weights, then the values of the weights can - # change and we need to use the worst-case values from the datatypes - if self.get_nodeattr("runtime_writeable_weights"): - wdt = self.get_weight_datatype() - lower_worst = wdt.min() * np.ones_like(weights) - lower_range = calculate_matvec_accumulator_range(lower_worst, idt) - upper_worst = wdt.max() * np.ones_like(weights) - upper_range = calculate_matvec_accumulator_range(upper_worst, idt) - acc_min = min(min(lower_range), min(upper_range)) - acc_max = max(max(upper_range), max(upper_range)) - - # if the thresholds can be used to determine range, then adjust the range - # according to the known values of the thresholds - if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - # set threshold datatype (and accumulator datatype implicitly) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - # clip threshold values - if max_threshold > acc_max or min_threshold < acc_min: - warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) - thresholds = np.clip(thresholds, acc_min, acc_max) - model.set_initializer(self.onnx_node.input[2], thresholds) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - acc_min = min(min_threshold, acc_min) - acc_max = max(max_threshold, acc_max) + def _infer_sparse_weight_tensor(self, W_conv, k_h, k_w, channels): + W_sparse = np.zeros((channels, channels, k_h, k_w), dtype=np.float32) + for ch in range(channels): + W_sparse[ch][ch] = W_conv[ch][0] + W_conv = W_sparse.astype(np.float32) + W_matmul = W_conv.transpose(0, 2, 3, 1) + W_matmul = W_matmul.reshape(channels, channels * k_h * k_w) + W_matmul = W_matmul.T + return W_matmul - # if the acc_range is always greater than 0, then acc_max <= 2^P - 1 - if acc_min >= 0: - acc_bit_width = np.log2(acc_max + 1) - acc_bit_width = math.ceil(acc_bit_width) - adt = DataType[f"UINT{acc_bit_width}"] - # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <= 
- # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max) + def execute_node(self, context, graph): + node = self.onnx_node + in_act = context[node.input[0]] + (_, dim_h, dim_w, _) = in_act.shape + (k_h, k_w) = self.get_nodeattr("Kernel") + channels = self.get_nodeattr("Channels") + producer = [x for x in graph.node if x.output[0] == node.input[0]] + if bool(producer) and ( + producer[0].op_type == "Im2Col" or producer[0].op_type == "ConvolutionInputGenerator" + ): + pe = channels else: - _acc_max = max(-acc_min, 1 + acc_max) - acc_bit_width = np.log2(_acc_max) + 1 - acc_bit_width = math.ceil(acc_bit_width) - adt = DataType[f"INT{acc_bit_width}"] - - # if activation, assert that the thresholds can be expressed with adt - if thresholds is not None: - assert np.vectorize(adt.allowed)( - threshold_tensor - ).all(), "Thresholds in %s can't be expressed with type %s" % ( - self.onnx_node.name, - str(adt), - ) - - # if no activation, output and accumulator datatypes are the same - if self.get_nodeattr("noActivation"): - # if this is the last node in the graph, then ensure the datatype is - # divisibly by 8 bits - if model.find_direct_successors(self.onnx_node) is None: - bw = roundup_to_integer_multiple(adt.bitwidth(), 8) - new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) - adt = DataType[new_adt_name] - # for no-activation nodes, output dt = acc dt - self.set_nodeattr("outputDataType", adt.name) - self.set_nodeattr("accDataType", adt.name) + pe = self.get_nodeattr("PE") - return DataType[self.get_nodeattr("accDataType")] + # Reorder the input activations. Note that PE gets interleaved by the SWG, + # so we have to untangle and for simplicity of computation assume pe=1. + # Note that PE has no effect on the QONNX node + in_act = in_act.reshape(1, dim_h, dim_w, channels // pe, k_h * k_w, pe) + in_act = in_act.transpose(0, 1, 2, 4, 3, 5) + in_act = in_act.reshape(1, dim_h, dim_w, channels * k_h * k_w) + # Reshape weights in appropriate format + vvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] + vvau_w = np_helper.to_array(vvau_w_init) + vvau_w_onnx = self._infer_sparse_weight_tensor(vvau_w, k_h, k_w, channels) - def minimize_weight_bit_width(self, model): - """Minimize the bit width based on the values of the weights""" - if not self.get_nodeattr("runtime_writeable_weights"): - weights = model.get_initializer(self.onnx_node.input[1]) - w_min = weights.min() - w_max = weights.max() - if w_min < 0: - if abs(w_min) > w_max: - wdt = DataType.get_smallest_possible(w_min) - else: - wdt = DataType.get_smallest_possible(-w_max - 1) - else: - wdt = DataType.get_smallest_possible(w_max) - self.set_nodeattr("weightDataType", wdt.name) - return DataType[self.get_nodeattr("weightDataType")] + if ( + self.get_nodeattr("inputDataType") == "BIPOLAR" + and self.get_nodeattr("weightDataType") == "BIPOLAR" + ): + result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format + result = (result + k_h * k_w) / 2 + else: + result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format - def calc_wmem(self): - """Calculates and returns WMEM.""" - ch = self.get_nodeattr("Channels") - k_h, k_w = self.get_nodeattr("Kernel") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wmem = (k_h * k_w * ch // pe) // simd - return wmem + if self.get_nodeattr("noActivation") == 0: + vvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] + vvau_thr = np_helper.to_array(vvau_thr_init) + odt_is_bipolar = 
self.get_nodeattr("outputDataType") == "BIPOLAR" + out_scale = 2 if odt_is_bipolar else 1 + out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") + # NHWC to NCHW for multithreshold node + result = result.transpose((0, 3, 1, 2)) + result = multithreshold(result, vvau_thr, out_scale, out_bias) + # NCHW to NHWC + result = result.transpose((0, 2, 3, 1)) + + context[node.output[0]] = result - def calc_tmem(self): - """Calculates and returns TMEM.""" - if self.get_nodeattr("noActivation") == 1: - return 0 - else: - ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - return ch // pe + def verify_node(self): + pass def make_shape_compatible_op(self, model): oshape = self.get_normal_output_shape() @@ -241,12 +185,16 @@ def infer_node_datatype(self, model): odt = self.get_output_datatype() model.set_tensor_datatype(node.output[0], odt) - def verify_node(self): - pass - def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] + # when performing FIFO insertion on an FC layer with ext weights, the ind + # parameter can be > 0 (referring to the weights) so handle that here + if ind == 0: + return DataType[self.get_nodeattr("inputDataType")] + elif ind == 1: + return DataType[self.get_nodeattr("weightDataType")] + else: + raise Exception("Undefined input ind for this layer type") def get_weight_datatype(self): """Returns FINN DataType of weights.""" @@ -261,17 +209,37 @@ def get_output_datatype(self, ind=0): return DataType[self.get_nodeattr("outputDataType")] def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() + i_bits = self.get_input_datatype(ind).bitwidth() simd = self.get_nodeattr("SIMD") pe = self.get_nodeattr("PE") in_width = i_bits * simd * pe return in_width + def get_weightstream_width(self): + """Returns weight stream width. Used only in internal_decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "internal_decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = simd * pe * wp + return w_width + else: + return 0 + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * self.get_nodeattr("PE") return out_width + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. 
Used in internal_decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") dim_h, dim_w = self.get_nodeattr("Dim") @@ -320,147 +288,384 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf - def get_exp_cycles(self): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") + def calc_wmem(self): + """Calculates and returns WMEM.""" ch = self.get_nodeattr("Channels") - dim_h, dim_w = self.get_nodeattr("Dim") k_h, k_w = self.get_nodeattr("Kernel") - # currently FINN supports for vvau a batch size of 1 - batch_size = 1 - # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv - return int(exp_cycles) - - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # out_is_binary = self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" - - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str - - return ret - - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") - ch = self.get_nodeattr("Channels") - k_h, k_w = self.get_nodeattr("Kernel") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - ch, - 1, - k_h, - k_w, - ), """Weights matrix doesn't - have expected shape (channels, 1, kernel_size, kernel_size)""" - ret = orig_weight_matrix - if self.get_weight_datatype() == DataType["BIPOLAR"]: - # convert bipolar to binary - ret = (ret + 1) / 2 - ret = ret.reshape(ch, k_h * k_w) - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - ret = ret.reshape(1, pe, wmem, simd) - return ret - - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for 
bipolar weights&inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return - """ - ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - tmem = self.calc_tmem() - assert ch % pe == 0, "Requirement Channels divisable by PE is violated." - assert ( - orig_thres_matrix.ndim == 2 - ), """Threshold matrix dimension is - not as expected (2).""" - n_thres_steps = orig_thres_matrix.shape[1] - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - if inp_is_bipolar and wt_is_bipolar: - # ensure all thresholds are nonnegative - assert (orig_thres_matrix >= 0).all() - # ensure all thresholds are integer - assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() - ret = orig_thres_matrix - # ensure channels = mh , duplicating if necessary - if ret.shape[0] == 1: - ret = np.tile(ret, (ch, 1)) - assert ret.shape[0] == ch, "Channels of threshold matrix are not as expected (ch)" - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - assert ( - ret.shape[0] == pe - ), """First dimension after distribution of the - rows between PEs is not as expected (pe)""" - assert ( - ret.shape[1] == tmem - ), """Second dimension after distribution of the - rows between PEs is not as expected (tmem)""" - assert ( - ret.shape[2] == n_thres_steps - ), """Third dimension after distribution of the - rows between PEs is not as expected (n_thres_steps)""" - return ret.reshape(1, pe, tmem, n_thres_steps) - - def make_weight_file(self, weights, weight_file_mode, weight_file_name): - """Produce a file containing given weights in appropriate format for this - layer. This file can be used for either synthesis or run-time reconfig - of weights. 
+ wmem = (k_h * k_w * ch // pe) // simd + return wmem - Arguments: + def calc_tmem(self): + """Calculates and returns TMEM.""" + if self.get_nodeattr("noActivation") == 1: + return 0 + else: + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + return ch // pe - * weights : numpy array with weights to be put into the file + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "internal_decoupled" and mstyle != "ultra") + or (mmode == "internal_embedded") + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier + + def bram_estimation(self): + """Calculates resource estimation for BRAM""" + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # since this is HLS memory, not using the full width of a BRAM + # assuming memories up to 128 deep get implemented in LUTs + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "internal_decoupled" and mstyle in ["distributed", "ultra"]) + or (mstyle == "auto" and self.calc_wmem() <= 128) + or (mmode == "internal_embedded" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) + else: + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) + + def bram_efficiency_estimation(self): + P = self.get_nodeattr("PE") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 + wbits = W * P * omega + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + + def uram_efficiency_estimation(self): + """Function for URAM efficiency estimation: actual parameter storage + needed divided by the allocated URAM storage (from estimation)""" + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = int(np.prod(self.get_nodeattr("Kernel"))) + D_out = self.get_nodeattr("Channels") + uram_est = self.uram_estimation() + if uram_est == 0: + return 1 + wbits = W * D_in * D_out + uram_est_capacity = uram_est * 72 * 4096 + return wbits / uram_est_capacity + + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. 
Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "internal_decoupled" and mstyle == "distributed") or ( + mmode == "internal_embedded" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + acc_bits = acc_datatype.bitwidth() + k_h, k_w = self.get_nodeattr("Kernel") + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + # TODO - add 'ram_style_threshold' node attribute + if noact == 0: + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + + def get_exp_cycles(self): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + ch = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + k_h, k_w = self.get_nodeattr("Kernel") + # currently FINN supports for vvau a batch size of 1 + batch_size = 1 + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv + return int(exp_cycles) + + def minimize_accumulator_width(self, model): + """Minimize the accumulator bit width according to the weight values, + input data types, and size of dot product""" + weights = model.get_initializer(self.onnx_node.input[1]) + k_h, k_w = self.get_nodeattr("Kernel") + fm = self.get_nodeattr("Channels") + # put weights into the shape expected by calculate_matvec_accumulator_range + weights = weights.reshape(fm, k_h * k_w).transpose() + # since in the calculation the values of the weight matrix are used, + # for the bipolar case they need to be converted to bipolar + if self.get_nodeattr("binaryXnorMode"): + weights = 2 * weights - 1 + if len(self.onnx_node.input) > 2: + thresholds = model.get_initializer(self.onnx_node.input[2]) + else: + thresholds = None + idt = 
self.get_input_datatype()
+
+        (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt)
+        # if runtime-writeable weights, then the values of the weights can
+        # change and we need to use the worst-case values from the datatypes
+        if self.get_nodeattr("runtime_writeable_weights"):
+            wdt = self.get_weight_datatype()
+            lower_worst = wdt.min() * np.ones_like(weights)
+            lower_range = calculate_matvec_accumulator_range(lower_worst, idt)
+            upper_worst = wdt.max() * np.ones_like(weights)
+            upper_range = calculate_matvec_accumulator_range(upper_worst, idt)
+            acc_min = min(min(lower_range), min(upper_range))
+            acc_max = max(max(lower_range), max(upper_range))
+
+        # if the thresholds can be used to determine range, then adjust the range
+        # according to the known values of the thresholds
+        if thresholds is not None:
+            threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds)
+            # set threshold datatype (and accumulator datatype implicitly)
+            min_threshold = thresholds.min()
+            max_threshold = thresholds.max()
+            # clip threshold values
+            if max_threshold > acc_max or min_threshold < acc_min:
+                warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name)
+                thresholds = np.clip(thresholds, acc_min, acc_max)
+                model.set_initializer(self.onnx_node.input[2], thresholds)
+                threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds)
+                min_threshold = thresholds.min()
+                max_threshold = thresholds.max()
+            acc_min = min(min_threshold, acc_min)
+            acc_max = max(max_threshold, acc_max)
+
+        # if the acc_range is always greater than 0, then acc_max <= 2^P - 1
+        if acc_min >= 0:
+            acc_bit_width = np.log2(acc_max + 1)
+            acc_bit_width = math.ceil(acc_bit_width)
+            adt = DataType[f"UINT{acc_bit_width}"]
+        # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <=
+        # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max)
+        else:
+            _acc_max = max(-acc_min, 1 + acc_max)
+            acc_bit_width = np.log2(_acc_max) + 1
+            acc_bit_width = math.ceil(acc_bit_width)
+            adt = DataType[f"INT{acc_bit_width}"]
+
+        # if activation, assert that the thresholds can be expressed with adt
+        if thresholds is not None:
+            assert np.vectorize(adt.allowed)(
+                threshold_tensor
+            ).all(), "Thresholds in %s can't be expressed with type %s" % (
+                self.onnx_node.name,
+                str(adt),
+            )
+
+        # if no activation, output and accumulator datatypes are the same
+        if self.get_nodeattr("noActivation"):
+            # if this is the last node in the graph, then ensure the datatype is
+            # divisible by 8 bits
+            if model.find_direct_successors(self.onnx_node) is None:
+                bw = roundup_to_integer_multiple(adt.bitwidth(), 8)
+                new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw))
+                adt = DataType[new_adt_name]
+            # for no-activation nodes, output dt = acc dt
+            self.set_nodeattr("outputDataType", adt.name)
+        self.set_nodeattr("accDataType", adt.name)
+
+        return DataType[self.get_nodeattr("accDataType")]
+
+    def minimize_weight_bit_width(self, model):
+        """Minimize the bit width based on the values of the weights"""
+        if not self.get_nodeattr("runtime_writeable_weights"):
+            weights = model.get_initializer(self.onnx_node.input[1])
+            w_min = weights.min()
+            w_max = weights.max()
+            if w_min < 0:
+                if abs(w_min) > w_max:
+                    wdt = DataType.get_smallest_possible(w_min)
+                else:
+                    wdt = DataType.get_smallest_possible(-w_max - 1)
+            else:
+                wdt = DataType.get_smallest_possible(w_max)
+            self.set_nodeattr("weightDataType", wdt.name)
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_hw_compatible_threshold_tensor(self, orig_thres_matrix):
+        """Convert the original numpy threshold matrix orig_thres_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure MH % PE == 0
+        * for bipolar weights & inputs, ensure thresholds are positive
+        * interleave rows between PEs
+        * reshape into (PE, TMEM, n_thres_steps) and return
+        """
+        ch = self.get_nodeattr("Channels")
+        pe = self.get_nodeattr("PE")
+        tmem = self.calc_tmem()
+        assert ch % pe == 0, "Requirement Channels divisible by PE is violated."
+        assert (
+            orig_thres_matrix.ndim == 2
+        ), """Threshold matrix dimension is
+        not as expected (2)."""
+        n_thres_steps = orig_thres_matrix.shape[1]
+        inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+        wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+        # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+        inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+        wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+        bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+        inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+        wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+        if inp_is_bipolar and wt_is_bipolar:
+            # ensure all thresholds are nonnegative
+            assert (orig_thres_matrix >= 0).all()
+            # ensure all thresholds are integer
+            assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all()
+        ret = orig_thres_matrix
+        # ensure channels = mh, duplicating if necessary
+        if ret.shape[0] == 1:
+            ret = np.tile(ret, (ch, 1))
+        assert ret.shape[0] == ch, "Channels of threshold matrix are not as expected (ch)"
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        assert (
+            ret.shape[0] == pe
+        ), """First dimension after distribution of the
+        rows between PEs is not as expected (pe)"""
+        assert (
+            ret.shape[1] == tmem
+        ), """Second dimension after distribution of the
+        rows between PEs is not as expected (tmem)"""
+        assert (
+            ret.shape[2] == n_thres_steps
+        ), """Third dimension after distribution of the
+        rows between PEs is not as expected (n_thres_steps)"""
+        return ret.reshape(1, pe, tmem, n_thres_steps)
+
+    def get_hw_compatible_weight_tensor(self, orig_weight_matrix):
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        ch = self.get_nodeattr("Channels")
+        k_h, k_w = self.get_nodeattr("Kernel")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            ch,
+            1,
+            k_h,
+            k_w,
+        ), """Weights matrix doesn't
+        have expected shape (channels, 1, kernel_size, kernel_size)"""
+        ret = orig_weight_matrix
+        if self.get_weight_datatype() == DataType["BIPOLAR"]:
+            # convert bipolar to binary
+            ret = (ret + 1) / 2
+        ret = ret.reshape(ch, k_h * k_w)
+        # distribute rows between PEs
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        ret = ret.reshape(1, pe, wmem, simd)
+        return ret
+
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights in appropriate format for this
+        layer. This file can be used for either synthesis or run-time reconfig
+        of weights.
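
The row interleaving used by both tensor-packing helpers deals channel rows out round-robin across PEs; an equivalent NumPy sketch (illustrative, not the qonnx helper itself):

```python
import numpy as np

def interleave_rows(mat, pe):
    # mat: (channels, n); row r is assigned to PE r % pe, so PE p holds
    # rows p, p + pe, p + 2*pe, ... -- the (PE, TMEM, n) layout used above
    ch, n = mat.shape
    assert ch % pe == 0
    return mat.reshape(ch // pe, pe, n).transpose(1, 0, 2)

thr = np.arange(8).reshape(4, 2)      # 4 channels, 2 threshold steps
print(interleave_rows(thr, 2).shape)  # (2, 2, 2)
```
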
+ + Arguments: + + * weights : numpy array with weights to be put into the file * weight_file_mode : one of {hls_header, decoupled_verilog_dat, decoupled_runtime} * weight_file_name : filename for the weight file to be generated """ # convert weights into hlslib-compatible format - weight_tensor = self.get_hls_compatible_weight_tensor(weights) + weight_tensor = self.get_hw_compatible_weight_tensor(weights) export_wdt = self.get_weight_datatype() # we have converted bipolar weights to binary for export, # so use it as such for weight generation @@ -490,13 +695,15 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): f_weights.write(weight_hls_code) f_weights.close() elif "decoupled" in weight_file_mode: - # create a weight stream for various flavors of decoupled mode: + # create a weight stream for various flavors of internal_decoupled mode: # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) # reverse SIMD flip for saving weights in .npy weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) # PE flip for saving weights in .dat weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) + # SIMD & PE flip + weight_tensor_pe_simd_flipped = np.flip(weight_tensor_pe_flipped, axis=-1) # reshape weight tensor (simd_flipped and pe_flipped) to desired shape pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -506,19 +713,32 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): # flipped weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd) weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() + # SIMD & PE flipped + weight_tensor_pe_simd_flipped = weight_tensor_pe_simd_flipped.reshape(1, -1, pe * simd) + weight_tensor_pe_simd_flipped = weight_tensor_pe_simd_flipped.copy() if weight_file_mode == "decoupled_npy": # save weight stream into npy for cppsim - np.save(weight_file_name, weight_tensor_simd_flipped) + if self.onnx_node.op_type == "VVAU_rtl": + weight_tensor_unflipped = weight_tensor_unflipped.reshape(1, -1, pe * simd) + weight_tensor_unflipped = weight_tensor_unflipped.copy() + np.save(weight_file_name, weight_tensor_unflipped) + else: + np.save(weight_file_name, weight_tensor_simd_flipped) elif weight_file_mode == "decoupled_verilog_dat": # convert weight values into hexstring weight_width = self.get_weightstream_width() # pad to nearest 4 bits to get hex strings weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" - ) + if self.onnx_node.op_type == "VVAU_rtl": + weight_arr = pack_innermost_dim_as_hex_string( + weight_tensor_pe_simd_flipped, export_wdt, weight_width_padded, prefix="" + ) + else: + weight_arr = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) # add zeroes to pad out file to 1024 entries - weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_arr.flatten() weight_stream = weight_stream.copy() with open(weight_file_name, "w") as f: for val in weight_stream: @@ -555,22 +775,22 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "const": + if mem_mode == "internal_embedded": # save hlslib-compatible weights in params.h weight_filename = 
"{}/params.h".format(code_gen_dir) self.make_weight_file(weights, "hls_header", weight_filename) - elif mem_mode == "decoupled" or mem_mode == "external": + elif mem_mode == "internal_decoupled" or mem_mode == "external": weight_filename_sim = "{}/weights.npy".format(code_gen_dir) - # save decoupled weights for cppsim + # save internal_decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # also save weights as Verilog .dat file # This file will be ignored when synthesizing UltraScale memory. weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl) else: raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", currently no other parameter value is supported!""" ) @@ -578,7 +798,7 @@ def generate_params(self, model, path): if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) # use UINT32 threshold export for bipolar times bipolar inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] @@ -610,384 +830,57 @@ def generate_params(self, model, path): odt_hls = export_odt.get_hls_datatype_str() f_thresh.write( "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ - = ".format( - self.calc_tmem(), - self.get_nodeattr("PE"), - threshold_tensor.shape[-1], - tdt_hls, - odt_hls, - self.get_nodeattr("ActVal"), - "comp::less_equal<%s, %s>" % (tdt_hls, tdt_hls), - ) - ) - f_thresh.write(thresholds_hls_code) - f_thresh.close() - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - mem_mode = self.get_nodeattr("mem_mode") - node = self.onnx_node - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for VectorVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) - dim_h, dim_w = self.get_nodeattr("Dim") - num_w_reps = dim_h * dim_w - - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - if self.calc_tmem() != 0: - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - dim_h, dim_w = self.get_nodeattr("Dim") - numReps = 1 * dim_h * dim_w - k_h, k_w = self.get_nodeattr("Kernel") - innerProdDim = k_h * k_w - mem_mode = self.get_nodeattr("mem_mode") - - self.code_gen_dict["$DEFINES$"] = [ - """#define Channels1 {}\n #define InnerProdDim {}\n - #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( - self.get_nodeattr("Channels"), - innerProdDim, - self.get_nodeattr("SIMD"), - self.get_nodeattr("PE"), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - if mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights_{} ("weights_{}");'.format( - self.get_weightstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - mem_mode = self.get_nodeattr("mem_mode") - map_to_hls_mult_style = { - "auto": "ap_resource_dflt()", - "lut": "ap_resource_lut()", - "dsp": "ap_resource_dsp()", - } - tmpl_args = 
self.get_template_param_values() - if self.calc_tmem() == 0: - odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() - threshs = "PassThroughActivation<%s>()" % odtype_hls_str - else: - threshs = "threshs" - - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Vector_Vector_Activate_Batch - (in0_{}, out_{}, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - if wdt == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - else: - export_wdt = wdt - wdtype_hls_str = export_wdt.get_hls_datatype_str() - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} - (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( - "Vector_Vector_Activate_Stream_Batch", - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - wdtype_hls_str, - self.hls_sname(), - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream> &in0_{}, - hls::stream> &weights_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_weightstream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), + = ".format( + self.calc_tmem(), + self.get_nodeattr("PE"), + threshold_tensor.shape[-1], + tdt_hls, + odt_hls, + self.get_nodeattr("ActVal"), + "comp::less_equal<%s, %s>" % (tdt_hls, tdt_hls), + ) ) - ] - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) + f_thresh.write(thresholds_hls_code) + f_thresh.close() - def pragmas(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - 
self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() - ) - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) + def get_op_and_param_counts(self): + k_h, k_w = self.get_nodeattr("Kernel") + fm = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + num_repetitions = int(dim_h * dim_w) + mac_count = k_h * k_w * fm * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = k_h * k_w * fm + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + if self.get_nodeattr("noActivation") == 0: + tdt = DataType[self.get_nodeattr("accDataType")] + thres_bits = tdt.bitwidth() + thres_param_type = "param_threshold_%db" % (thres_bits) + thres_count = fm + ret_dict[thres_param_type] = thres_count + return ret_dict - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") - ) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") - ) + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["internal_decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() @@ -995,7 +888,7 @@ def get_verilog_top_module_intf_names(self): sname = self.hls_sname() if mem_mode == "external": intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if runtime_writable: @@ -1006,7 +899,7 @@ def code_generation_ipi(self): cmd = [] # add streamer if needed mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if 
self.get_nodeattr("ram_style") == "ultra": assert ( @@ -1030,11 +923,9 @@ def code_generation_ipi(self): "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) + # Instantiate either the HLS or RTL IP depending on operator + self.instantiate_ip(cmd) + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" @@ -1103,212 +994,9 @@ def code_generation_ipi(self): # TODO calculate and pass in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + elif mem_mode == "internal_embedded" or mem_mode == "external": + # base class impl sufficient for internal_embedded/external modes + self.instantiate_ip(cmd) else: raise Exception("Unrecognized mem_mode for VectorVectorActivation") return cmd - - def uram_estimation(self): - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - mem_width = Q * W * P - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const") - or (mmode == "external") - ): - return 0 - width_multiplier = math.ceil(mem_width / 72) - depth_multiplier = math.ceil(omega / 4096) - return width_multiplier * depth_multiplier - - def bram_estimation(self): - """Calculates resource estimation for BRAM""" - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - mem_width = Q * W * P - # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # since this is HLS memory, not using the full width of a BRAM - # assuming memories up to 128 deep get implemented in LUTs - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mstyle == "auto" and self.calc_wmem() <= 128) - or (mmode == "const" and self.calc_wmem() <= 128) - or (mmode == "external") - ): - return 0 - - if mem_width == 1: - return math.ceil(omega / 16384) - elif mem_width == 2: - return math.ceil(omega / 8192) - elif mem_width <= 4: - return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) - elif mem_width <= 9: - return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) - elif mem_width <= 18 or omega > 512: - return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) - else: - return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) - - def bram_efficiency_estimation(self): - P = self.get_nodeattr("PE") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - bram16_est = self.bram_estimation() - if bram16_est == 0: - return 1 - wbits = W * P * omega - bram16_est_capacity = bram16_est * 36 * 512 - return wbits / bram16_est_capacity - - def lut_estimation(self): - """Calculates resource estimations for LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. 
Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # adder tree - addertree_luts = (W + A) * (2 * Q - 1) - # accumulator - acc_datatype = self.get_accumulator_datatype() - acc_bits = acc_datatype.bitwidth() - k_h, k_w = self.get_nodeattr("Kernel") - # if accDataType is not set, then it will default to INT32, which would - # be a large overestimate in most (if not all) cases. In this scenario, - # we would use the minimum accumulator as determined by the data types - # bound, derived in https://arxiv.org/abs/2301.13376 - alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) - acc_bits = min( - acc_datatype.bitwidth(), - np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), - ) - acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - # TODO - add 'ram_style_threshold' node attribute - if noact == 0: - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 - comp_luts = (2**B - 1) * acc_bits - - return int( - c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 - ) - - def dsp_estimation(self): - # multiplication - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) - - def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" - if ( - self.get_nodeattr("mem_mode") == "decoupled" - or self.get_nodeattr("mem_mode") == "external" - ): - simd = self.get_nodeattr("SIMD") - pe = self.get_nodeattr("PE") - wp = self.get_weight_datatype().bitwidth() - w_width = simd * pe * wp - return w_width - else: - return 0 - - def get_weightstream_width_padded(self): - """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" - weight_width = self.get_weightstream_width() - return roundup_to_integer_multiple(weight_width, 8) - - def get_op_and_param_counts(self): - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - dim_h, dim_w = self.get_nodeattr("Dim") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_repetitions = int(dim_h * dim_w) - mac_count = k_h * k_w * fm * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types
-        bw1 = min(inp_bits, weight_bits)
-        bw2 = max(inp_bits, weight_bits)
-        mac_op_type = "op_mac_%dbx%db" % (bw1, bw2)
-        weight_param_type = "param_weight_%db" % (weight_bits)
-        weight_count = k_h * k_w * fm
-        ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count}
-        if self.get_nodeattr("noActivation") == 0:
-            tdt = DataType[self.get_nodeattr("accDataType")]
-            thres_bits = tdt.bitwidth()
-            thres_param_type = "param_threshold_%db" % (thres_bits)
-            thres_count = fm
-            ret_dict[thres_param_type] = thres_count
-        return ret_dict
-
-    def derive_characteristic_fxns(self, period):
-        n_inps = np.prod(self.get_folded_input_shape()[:-1])
-        io_dict = {
-            "inputs": {
-                "in0": [0 for i in range(n_inps)],
-            },
-            "outputs": {"out": []},
-        }
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode in ["decoupled", "external"]:
-            n_weight_inps = self.calc_wmem()
-            num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
-            io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/qnn-data/build_dataflow/build.py b/src/finn/qnn-data/build_dataflow/build.py
index 0d9d55a086..13d58d2c91 100644
--- a/src/finn/qnn-data/build_dataflow/build.py
+++ b/src/finn/qnn-data/build_dataflow/build.py
@@ -43,6 +43,7 @@
    mvau_wwidth_max=10000,
    # can specify detailed folding/FIFO/etc config with:
    # folding_config_file="folding_config.json",
+    specialize_layers_config_file="specialize_layers_config.json",
    synth_clk_period_ns=10.0,
    board=platform_name,
    shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ,
diff --git a/src/finn/qnn-data/build_dataflow/folding_config.json b/src/finn/qnn-data/build_dataflow/folding_config.json
index 95167f1a30..46f1d6236d 100644
--- a/src/finn/qnn-data/build_dataflow/folding_config.json
+++ b/src/finn/qnn-data/build_dataflow/folding_config.json
@@ -1,30 +1,30 @@
 {
   "Defaults": {},
-  "Thresholding_Batch_0": {
+  "Thresholding_hls_0": {
     "PE": 49,
     "ram_style": "distributed"
   },
-  "MatrixVectorActivation_0": {
+  "MVAU_hls_0": {
     "PE": 16,
     "SIMD": 49,
     "ram_style": "block"
   },
-  "MatrixVectorActivation_1": {
+  "MVAU_hls_1": {
     "PE": 8,
     "SIMD": 8,
     "ram_style": "auto"
   },
-  "MatrixVectorActivation_2": {
+  "MVAU_hls_2": {
     "PE": 8,
     "SIMD": 8,
     "ram_style": "auto"
   },
-  "MatrixVectorActivation_3": {
+  "MVAU_hls_3": {
     "PE": 10,
     "SIMD": 8,
     "ram_style": "distributed"
   },
-  "LabelSelect_Batch_0": {
+  "LabelSelect_hls_0": {
     "PE": 1
   }
 }
diff --git a/src/finn/qnn-data/build_dataflow/specialize_layers_config.json b/src/finn/qnn-data/build_dataflow/specialize_layers_config.json
new file mode 100644
index 0000000000..c2a8bd4553
--- /dev/null
+++ b/src/finn/qnn-data/build_dataflow/specialize_layers_config.json
@@ -0,0 +1,30 @@
+{
+  "Defaults": {},
+  "Thresholding_0": {
+    "preferred_impl_style": "hls"
+  },
+  "MVAU_0": {
+    "preferred_impl_style": "hls"
+  },
+  "Thresholding_1": {
+    "preferred_impl_style": ""
+  },
+  "MVAU_1": {
+    "preferred_impl_style": ""
+  },
+  "Thresholding_2": {
+    "preferred_impl_style": ""
+  },
+  "MVAU_2": {
+    "preferred_impl_style": ""
+  },
+  "Thresholding_3": {
+    "preferred_impl_style": "rtl"
+  },
+  "MVAU_3": {
+    "preferred_impl_style": ""
+  },
+  "LabelSelect_0": {
+    "preferred_impl_style": "hls"
+  }
+}
diff --git a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json
index 442ea72d9a..498d329ba3 100644
--- a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json
+++
b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json @@ -1,30 +1,30 @@ { "Defaults": {}, - "Thresholding_Batch_0": { + "Thresholding_hls_0": { "PE": 49, "ram_style": "distributed" }, - "MatrixVectorActivation_0": { + "MVAU_hls_0": { "PE": 16, "SIMD": 49, "ram_style": "block" }, - "MatrixVectorActivation_1": { + "MVAU_hls_1": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "MatrixVectorActivation_2": { + "MVAU_hls_2": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "MatrixVectorActivation_3": { + "MVAU_hls_3": { "PE": 10, "SIMD": 8, "ram_style": "distributed" }, - "LabelSelect_Batch_0": { + "LabelSelect_hls_0": { "PE": 1 } } diff --git a/src/finn/transformation/fpgadataflow/annotate_cycles.py b/src/finn/transformation/fpgadataflow/annotate_cycles.py index 7befad7aa7..6646434bdf 100644 --- a/src/finn/transformation/fpgadataflow/annotate_cycles.py +++ b/src/finn/transformation/fpgadataflow/annotate_cycles.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation -from finn.transformation.move_reshape import _is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node class AnnotateCycles(Transformation): @@ -46,7 +47,7 @@ def apply(self, model): graph = model.graph # annotate node cycles for node in graph.node: - if _is_fpgadataflow_node(node): + if is_hls_node(node) or is_rtl_node(node): op_inst = registry.getCustomOp(node) cycles = op_inst.get_exp_cycles() op_inst.set_nodeattr("cycles_estimate", cycles) diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py index bb5637f7d3..f07a5186d5 100644 --- a/src/finn/transformation/fpgadataflow/annotate_resources.py +++ b/src/finn/transformation/fpgadataflow/annotate_resources.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,7 +35,7 @@ from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.analysis.fpgadataflow.post_synth_res import post_synth_res from finn.analysis.fpgadataflow.res_estimation import res_estimation -from finn.transformation.move_reshape import _is_fpgadataflow_node +from finn.util.fpgadataflow import is_fpgadataflow_node class AnnotateResources(Transformation): @@ -68,7 +69,7 @@ def apply(self, model): children_dict = {} # annotate node resources for node in graph.node: - if _is_fpgadataflow_node(node) and node.name in self.res_dict.keys(): + if is_fpgadataflow_node(node) and node.name in self.res_dict.keys(): op_inst = registry.getCustomOp(node) op_inst.set_nodeattr("res_" + self.mode, str(self.res_dict[node.name])) children_dict[node.name] = self.res_dict[node.name] diff --git a/src/finn/transformation/fpgadataflow/cleanup.py b/src/finn/transformation/fpgadataflow/cleanup.py index 398580c48e..907b65eb9d 100644 --- a/src/finn/transformation/fpgadataflow/cleanup.py +++ b/src/finn/transformation/fpgadataflow/cleanup.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -53,7 +54,7 @@ def apply(self, model): model.set_metadata_prop("vivado_stitch_proj", "") for node in model.graph.node: op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/compile_cppsim.py b/src/finn/transformation/fpgadataflow/compile_cppsim.py index e93a8ec307..6190560265 100644 --- a/src/finn/transformation/fpgadataflow/compile_cppsim.py +++ b/src/finn/transformation/fpgadataflow/compile_cppsim.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,7 +30,7 @@ import qonnx.custom_op.registry as registry from qonnx.transformation.base import NodeLocalTransformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node class CompileCppSim(NodeLocalTransformation): @@ -50,7 +51,7 @@ def __init__(self, num_workers=None): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py similarity index 86% rename from src/finn/transformation/fpgadataflow/convert_to_hls_layers.py rename to src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index ef02453498..897d714bf8 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. 
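In the CompileCppSim change above, only HLS-specialized nodes are compiled for cppsim. A rough, hypothetical sketch of what the `is_hls_node` / `is_rtl_node` predicates from `finn.util.fpgadataflow` are assumed to check (domain-based dispatch after layer specialization; the real implementations may differ):

```python
# Illustration only, not the actual finn.util.fpgadataflow code: after
# specialize_layers, each fpgadataflow node is assumed to live in either
# the .hls or .rtl subdomain, which is what these predicates key off.
def is_hls_node(node):
    return node.domain == "finn.custom_op.fpgadataflow.hls"

def is_rtl_node(node):
    return node.domain == "finn.custom_op.fpgadataflow.rtl"
```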
# # Redistribution and use in source and binary forms, with or without @@ -44,9 +44,8 @@ class InferConvInpGen(Transformation): """Convert Im2Col layers to ConvolutionInputGenerator layers.""" - def __init__(self, use_rtl_variant=False): + def __init__(self): super().__init__() - self.use_rtl_variant = use_rtl_variant def apply(self, model): graph = model.graph @@ -70,7 +69,6 @@ def apply(self, model): pad_h = pad_attr[0] + pad_attr[2] pad_w = pad_attr[1] + pad_attr[3] dilation_h, dilation_w = i2c_inst.get_nodeattr("dilations") - # temporary checks until non-square conv support is finalized pad_val = i2c_inst.get_nodeattr("pad_value") depthwise = i2c_inst.get_nodeattr("depthwise") ifm_ch = i2c_in_shape[-1] @@ -86,9 +84,6 @@ def apply(self, model): ConvInpGen_idim_w = ifm_dim_w if pad_h > 0 or pad_w > 0: - # if padding enabled, ensure pad_val supported by DataType - # assert dt.allowed(pad_val),"""FMPadding_Batch DataType - # must support pad_val""" assert pad_val == 0, ( "%s : FMPadding_Batch doesn't currently support pad_val!= 0" % n.name ) @@ -110,10 +105,8 @@ def apply(self, model): ConvInpGen_idim_h = odim_padding_h ConvInpGen_idim_w = odim_padding_w - padding_optype = "FMPadding_rtl" if self.use_rtl_variant else "FMPadding_Batch" - padding_node = helper.make_node( - padding_optype, + "FMPadding", [i2c_input], [padding_out], domain="finn.custom_op.fpgadataflow", @@ -129,15 +122,37 @@ def apply(self, model): is_kernel_pointwise = k_h == 1 and k_w == 1 is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w - is_square_kernel = k_h == k_w is_equal_stride = stride_h == stride_w - is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( - k_h > 1 and k_w == 1 and ifm_dim_w == 1 - ) - if self.use_rtl_variant: + is_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) + if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: + downsample_1D = is_1D + is1D_unitx = ifm_dim_w == 1 + downsample_2D = (not downsample_1D) and is_square_image and is_equal_stride + if not (downsample_1D or downsample_2D): + warnings.warn(f"Couldn't infer Downsample from {n.name},check config.") + continue + ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w) + stride = max(stride_h, stride_w) + # create DownSampler node + ConvInpGen_node = helper.make_node( + "DownSampler", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ImgDim=ConvInpGen_idim, + NumChannels=ifm_ch, + SIMD=ifm_ch, + Stride=stride, + inputDataType=dt.name, + name="DownSampler_" + n.name, + is1D=downsample_1D, + is1D_unitx=is1D_unitx, + ) + else: ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator_rtl", + "ConvolutionInputGenerator", [ConvInpGen_input], [i2c_output], domain="finn.custom_op.fpgadataflow", @@ -147,106 +162,15 @@ def apply(self, model): IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], OFMDim=[ofm_dim_h, ofm_dim_w], SIMD=ifm_ch, - M=1, - parallel_window=0, Stride=[stride_h, stride_w], Dilation=[dilation_h, dilation_w], inputDataType=dt.name, outputDataType=dt.name, depthwise=depthwise, - name="ConvolutionInputGenerator_rtl_" + n.name, + is1D=is_1D, + name="ConvolutionInputGenerator_" + n.name, ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) - else: - # Ensure that only supported HLS nodes are inserted - if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: - downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) - is1D_unitx = ifm_dim_w == 1 - downsample_2D = (not downsample_1D) and is_square_image and is_equal_stride - if not (downsample_1D or 
downsample_2D): - warnings.warn(f"Couldn't infer Downsample from {n.name},check config.") - continue - ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w) - stride = max(stride_h, stride_w) - # create DownSampler node - ConvInpGen_node = helper.make_node( - "DownSampler", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ImgDim=ConvInpGen_idim, - NumChannels=ifm_ch, - SIMD=ifm_ch, - Stride=stride, - inputDataType=dt.name, - name="DownSampler_" + n.name, - is1D=downsample_1D, - is1D_unitx=is1D_unitx, - ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) - else: - # create equivalent ConvolutionInputGenerator node - if is_square_image and is_square_kernel: # square images and square kernels - assert is_equal_stride, ( - """%s: Non-equal strides along different axes is not supported - for (non-)square convolutions""" - % n.name - ) - assert dilation_h == 1 and dilation_w == 1, ( - """%s: Dilation value != 1 is not supported - for square convolutions""" - % n.name - ) - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator_" + n.name, - ) - else: # 1D images and/or kernels - assert is_1d_convolution, ( - """%s: ConvolutionInputGenerator1D works only - for 1D convs""" - % n.name - ) - if dilation_h > 1 or dilation_w > 1: - assert depthwise == 1, ( - """%s: Dilation value > 1 is only supported for - 1D depthwise separable convolutions""" - % n.name - ) - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator1D", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator1D_" + n.name, - ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) # remove old nodes graph.node.remove(n) graph_modified = True @@ -256,10 +180,96 @@ def apply(self, model): return (model, graph_modified) +class InferThresholdingLayer(Transformation): + """Convert any MultiThreshold into a standalone thresholding HLS layer.""" + + def __init__(self): + super().__init__() + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "MultiThreshold": + thl_input = node.input[0] + thl_threshold = node.input[1] + thl_output = node.output[0] + thl_in_shape = model.get_tensor_shape(thl_input) + thl_thres_shape = model.get_tensor_shape(thl_threshold) + idt = model.get_tensor_datatype(thl_input) + + # skip conversion for layers with float input + if not idt.is_integer(): + continue + + # check layout of inputs/outputs, and convert if needed + # check layout and convert if necessary + thl_in_layout = model.get_tensor_layout(thl_input) + if thl_in_layout == DataLayout.NCHW: + thl_input = 
nchw_to_nhwc(thl_input, model, node_ind)
+                    node_ind += 1
+                    thl_in_shape = model.get_tensor_shape(thl_input)
+
+                # keep track of where we need to insert the HW Op
+                # it has to be ahead of the output transform
+                insert_point = node_ind
+                thl_output_layout = model.get_tensor_layout(thl_output)
+                if thl_output_layout == DataLayout.NCHW:
+                    thl_output = nchw_to_nhwc(thl_output, model, node_ind, reverse=True)
+                    node_ind += 1
+
+                # now safe to assume number of channels is in last dimension
+                ifc = int(thl_in_shape[-1])
+                # create node with no parallelization first
+                pe = 1
+
+                odt = model.get_tensor_datatype(thl_output)
+                scale = getCustomOp(node).get_nodeattr("out_scale")
+                assert scale == 1.0, (
+                    node.name + ": MultiThreshold out_scale must be 1 for HLS conversion."
+                )
+                actval = getCustomOp(node).get_nodeattr("out_bias")
+                assert int(actval) == actval, (
+                    node.name + ": MultiThreshold out_bias must be integer for HLS conversion."
+                )
+                actval = int(actval)
+
+                # a signed activation should always have a negative bias,
+                # but BIPOLAR uses -1 as its 0 encoding, so the assert does not apply
+                if odt != DataType["BIPOLAR"]:
+                    assert (not odt.signed()) or (actval < 0), (
+                        node.name + ": Signed output requires actval < 0"
+                    )
+
+                new_node = helper.make_node(
+                    "Thresholding",
+                    [thl_input, thl_threshold],
+                    [thl_output],
+                    domain="finn.custom_op.fpgadataflow",
+                    backend="fpgadataflow",
+                    NumChannels=ifc,
+                    PE=pe,
+                    numSteps=thl_thres_shape[1],
+                    inputDataType=idt.name,
+                    weightDataType=idt.name,
+                    outputDataType=odt.name,
+                    numInputVectors=list(thl_in_shape[:-1]),
+                    ActVal=actval,
+                    name="Thresholding_" + node.name,
+                )
+
+                graph.node.insert(insert_point, new_node)
+                # remove old node
+                graph.node.remove(node)
+                graph_modified = True
+
+        return (model, graph_modified)
+
+
class InferUpsample(Transformation):
-    """
-    Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour_Batch nodes.
-    """
+    """Convert Upsample and Resize nodes to UpsampleNearestNeighbour nodes."""

    def apply(self, model):
        graph = model.graph
@@ -323,7 +333,7 @@ def apply(self, model):
                    "%s: Upsampling is only supported for 1D H or 2D square inputs."
                    % n.name
                )
-                # Extract information for HLS node
+                # Extract information for HW node
                IFMDim = in_shape[1]
                OFMDim = int(round(in_shape[1] * spatial_scale))
                NumChannels = in_shape[-1]
                numInputVectors = in_shape[0]
                inputDataType = dt.name
                dim_mode = 0 if is_shape_square_2d else 1
-                # Insert the HLSCustomOp node
-                Upsample_HLS_node = helper.make_node(
-                    "UpsampleNearestNeighbour_Batch",
+                # Insert the HWCustomOp node
+                Upsample_HW_node = helper.make_node(
+                    "UpsampleNearestNeighbour",
                    [n.input[0]],
                    [n.output[0]],
                    domain="finn.custom_op.fpgadataflow",
@@ -344,11 +354,11 @@
                    inputDataType=inputDataType,
                    numInputVectors=numInputVectors,
                    DimMode=dim_mode,
-                    name="UpsampleNearestNeighbour_Batch_" + n.name,
+                    name="UpsampleNearestNeighbour_" + n.name,
                )
                # Remove the old node
-                graph.node.insert(node_ind, Upsample_HLS_node)
+                graph.node.insert(node_ind, Upsample_HW_node)
                # remove old nodes
                graph.node.remove(n)
                graph_modified = True
@@ -356,7 +366,7 @@

class InferStreamingMaxPool(Transformation):
-    """Convert MaxPoolNHWC layers to StreamingMaxPool layers."""
+    """Convert MaxPoolNHWC layers to StreamingMaxPool HW layers."""

    def apply(self, model):
        graph = model.graph
@@ -368,10 +378,15 @@
                mp_input = node.input[0]
                mp_output = node.output[0]
                mp_in_shape = model.get_tensor_shape(mp_input)
-                # mp_out_shape = model.get_tensor_shape(mp_output)
                dt = model.get_tensor_datatype(mp_input)
                mp_inst = getCustomOp(node)
                k_h, k_w = mp_inst.get_nodeattr("kernel_shape")
+                s_h, s_w = mp_inst.get_nodeattr("strides")
+                if k_h != s_h or k_w != s_w:
+                    warn_str = """Stride is not equal to kernel. Node cannot be converted to
+                    StreamingMaxPool layer."""
+                    warnings.warn(warn_str)
+                    continue
                ifm_ch = mp_in_shape[-1]
                ifm_dim_h = mp_in_shape[1]
                ifm_dim_w = mp_in_shape[2]
@@ -383,9 +398,9 @@
                pass_1d = is_1d and (not is_bipolar)
                pass_2d = (not is_1d) and is_divisable
                if pass_1d or pass_2d:
-                    # create equivalent StreamingMaxPool_Batch node
+                    # create equivalent StreamingMaxPool node
                    new_node = helper.make_node(
-                        "StreamingMaxPool_Batch",
+                        "StreamingMaxPool",
                        [mp_input],
                        [mp_output],
                        domain="finn.custom_op.fpgadataflow",
@@ -396,24 +411,22 @@
                        dataType=dt.name,
                        PE=pe,
                        CeilMode=ceil_mode,
-                        name="StreamingMaxPool_Batch_" + node.name,
+                        name="StreamingMaxPool_" + node.name,
                    )
                    graph.node.insert(node_ind, new_node)
                    # remove old nodes
                    graph.node.remove(node)
                    graph_modified = True
                else:
-                    warnings.warn(node.name + ": could not convert to HLS")
+                    warnings.warn(node.name + ": could not convert to HW")
        if graph_modified:
            model = model.transform(InferShapes())
            model = model.transform(InferDataTypes())
        return (model, graph_modified)

-class InferPool_Batch(Transformation):
-    """If kernel_shape > strides, replace Pool layer with with of Im2col
-    + pool(with kernel_shape == strides), plus Transpose layers to keep the original
-    data layout."""
+class InferAddStreamsLayer(Transformation):
+    """Convert any Add into an AddStreams HW layer."""

    def apply(self, model):
        graph = model.graph
@@ -421,607 +434,376 @@
        graph_modified = False
        for node in graph.node:
            node_ind += 1
-            if node.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]:
-                node_input = node.input[0]
-                ishape = model.get_tensor_shape(node_input)
-                node_output = node.output[0]
-                idt = model.get_tensor_datatype(node_input)
-                oshape = model.get_tensor_shape(node_output)
-                # only support 4D input tensors (1D convs need extra dummy dim)
-
if len(ishape) != 4: + if node.op_type == "Add": + in0 = node.input[0] + in1 = node.input[1] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + in1_shape = model.get_tensor_shape(in1) + in0_static = not (model.get_initializer(in0) is None) + in1_static = not (model.get_initializer(in1) is None) + + # skip if different shapes on inputs + if in0_shape != in1_shape: + continue + # skip if any of inputs have initializers + # (this node is meant for adding two dynamic streams) + if in0_static or in1_static: continue - # extract pool parameters - if node.op_type == "MaxPool": - kh, kw = list(get_by_name(node.attribute, "kernel_shape").ints) - sh, sw = list(get_by_name(node.attribute, "strides").ints) - dlayout = "NCHW" - elif node.op_type == "QuantAvgPool2d": - inst = getCustomOp(node) - # QuantAvgPool2d has a single scalar attribute - # for kernel size and stride (implicit square) - kh = kw = inst.get_nodeattr("kernel") - sh = sw = inst.get_nodeattr("stride") - dlayout = inst.get_nodeattr("data_layout") - elif node.op_type == "MaxPoolNHWC": - inst = getCustomOp(node) - kh, kw = inst.get_nodeattr("kernel_shape") - sh, sw = inst.get_nodeattr("strides") - dlayout = "NHWC" - try: - pad = list(get_by_name(node.attribute, "pads").ints) - except AttributeError: - pad = [0, 0, 0, 0] + idt0 = model.get_tensor_datatype(in0) + idt1 = model.get_tensor_datatype(in1) - if not idt.is_integer(): + # skip if different data types on inputs + if idt0 != idt1: continue - if (kh < sh) or (kw < sw): - # TODO check/implement swg support + idt = idt0 + + # skip conversion for layers with float input + if not idt.is_integer(): continue - odt = model.get_tensor_datatype(node_output) + # check layout and convert if necessary + in0_layout = model.get_tensor_layout(in0) + in1_layout = model.get_tensor_layout(in1) + result_layout = model.get_tensor_layout(result) - if dlayout == "NCHW": - _, ifm_ch, ifm_h, ifm_w = ishape - _, ofm_ch, ofm_h, ofm_w = oshape - elif dlayout == "NHWC": - _, ifm_h, ifm_w, ifm_ch = ishape - _, ofm_h, ofm_w, ofm_ch = oshape - else: - raise Exception("Unknown dlayout: " + str(dlayout)) + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) + node_ind += 1 + in0_shape = model.get_tensor_shape(in0) - # if data layout NCHW, we need transpose nodes surrounding - # the hls layer - if dlayout == "NCHW": - # create new intermediate values - inp_trans_out = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - (1, ifm_h, ifm_w, ifm_ch), # NHWC - ) - graph.value_info.append(inp_trans_out) - inp_trans_out = inp_trans_out.name - model.set_tensor_datatype(inp_trans_out, idt) + if in1_layout == DataLayout.NCHW: + in1 = nchw_to_nhwc(in1, model, node_ind) + node_ind += 1 + in1_shape = model.get_tensor_shape(in1) - pool_output = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - (1, ofm_h, ofm_w, ofm_ch), - ) - graph.value_info.append(pool_output) - pool_output = pool_output.name - # model.set_tensor_datatype(pool_output, odt) + # keep track of where we need to insert the HW Op + # it has to be ahead of the output transform + insert_point = node_ind - im2col_out = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - (1, ofm_h, ofm_w, ifm_ch * kh * kw), - ) - graph.value_info.append(im2col_out) - im2col_out = im2col_out.name - model.set_tensor_datatype(im2col_out, idt) + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, 
reverse=True) + node_ind += 1 - # create new nodes - if dlayout == "NCHW": - # NCHW -> NHWC - inp_trans_node = helper.make_node( - "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] - ) - im2col_in = inp_trans_out - else: - im2col_in = node_input - pool_output = node_output + # now safe to assume num_channels is size of last dimension + num_channels = int(in0_shape[-1]) + # create node with no parallelization first + pe = 1 - accum_bits = 0 - pool_size_param = 0 # will be overridden if neededs - pad_value = 0 - if node.op_type in ["MaxPool", "MaxPoolNHWC"]: - pool_fxn = "MaxPool" - odt = idt - pad_value = idt.min() - elif node.op_type == "QuantAvgPool2d": - assert odt.is_integer(), """Output data type for QuantAvgPool2d - needs to be integer""" - assert all(x == 0 for x in pad), "Padding is not supported for QuantAvgPool2d" - inst = getCustomOp(node) - pool_fxn = "QuantAvgPool" - pool_size_param = inst.get_shifts() - accum_bits = inst.get_accum_size() + # create and insert new AddStreams node + new_node = helper.make_node( + "AddStreams", + [in0, in1], + [result], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_channels, + PE=pe, + inputDataType=idt.name, + numInputVectors=in0_shape[:-1], + name="AddStreams_" + node.name, + ) + graph.node.insert(insert_point, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True - else: - raise Exception( - "pad_value and pool_fxn not configured for {}".format(node.op_type) + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferDuplicateStreamsLayer(Transformation): + """Insert a DuplicateStreams HW layer for any tensor with fanout == 2""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + successors = model.find_consumers(node.output[0]) + if successors is not None and len(successors) >= 2: + output_tensor = node.output[0] + n_outputs = len(successors) + + dt = model.get_tensor_datatype(output_tensor) + + # skip conversion for layers with float input + if not dt.is_integer(): + continue + + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(n_outputs): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape ) + model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] - # format input tensor - im2col_node = helper.make_node( - "Im2Col", - [im2col_in], - [im2col_out], - domain="qonnx.custom_op.general", - stride=[sh, sw], - kernel_size=[kh, kw], - pad_amount=pad, - pad_value=pad_value, - depthwise=1, - input_shape="(1,{},{},{})".format(ifm_h, ifm_w, ifm_ch), - name="Im2Col_" + node.name, - ) + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] - # Warning PE has to be equal to ifm_ch until Im2Col is replaced by - # ConvolutionInputGenerator with depthwise=1. 
- # For other settings the output will be incorrect due to incorrect input - # data layout - pool_node = helper.make_node( - "Pool_Batch", - [im2col_out], - [pool_output], + # create node with no parallelization first + pe = 1 + + dup_node = helper.make_node( + "DuplicateStreams", + [output_tensor], + out_tensor_clones, domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - InputDataType=idt.name, - OutputDataType=odt.name, - Channels=ifm_ch, - PE=ifm_ch, - KernelSize=[kh, kw], - Function=pool_fxn, - OutImgDims=[ofm_h, ofm_w], - AccumBits=accum_bits, - Size=pool_size_param, - BatchSize=1, - name="Pool_Batch_" + node.name, + NumChannels=num_ch, + PE=pe, + inputDataType=dt.name, + numInputVectors=vecs, + NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, + name="DuplicateStreams_" + node.name, ) - if dlayout == "NCHW": - # NHWC -> NCHW - out_trans_node = helper.make_node( - "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2] - ) + graph.node.insert(node_ind, dup_node) + + # connect successors to out tensor clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break - # insert nodes where the conv is to preserve topological ordering - if dlayout == "NCHW": - graph.node.insert(node_ind, inp_trans_node) - graph.node.insert(node_ind + 1, im2col_node) - graph.node.insert(node_ind + 2, pool_node) - graph.node.insert(node_ind + 3, out_trans_node) - else: - graph.node.insert(node_ind, im2col_node) - graph.node.insert(node_ind + 1, pool_node) - # remove old node - graph.node.remove(node) graph_modified = True if graph_modified: + model = model.transform(SortGraph()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) -class InferBinaryMatrixVectorActivation(Transformation): - """Convert XnorPopcountMatMul layers to - MatrixVectorActivation layers. Any immediately following MultiThreshold - layers will also be absorbed into the MVTU.""" +class InferChannelwiseLinearLayer(Transformation): + """Convert any channel-wise Add/Mul into a HW layer.""" - def __init__(self, mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode + def get_smallest_possible(self, vals): + """Returns smallest (fewest bits) possible DataType that can represent + value. Prefers unsigned integers where possible.""" + vals = np.array(vals, dtype=np.float64) + for v in vals: + assert int(v) == v, "Error float value" + + for k in DataType.get_accumulator_dt_cands(): + dt = DataType[k] + + if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: + # not currently supported + continue + + if (dt.min() <= vals).all() and (vals <= dt.max()).all(): + return dt + + warnings.warn( + """InferChannelwiseLinearLayer: Output values may not be + representable with supported data types. + Setting maximum width data type available. 
+ This will lead to errors if there are no constrains on the input + """ + ) + + if (0 <= vals).all(): + return DataType["UINT64"] + else: + return DataType["INT64"] def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False - for n in graph.node: + for node in graph.node: node_ind += 1 - if n.op_type == "XnorPopcountMatMul": - mm_input = n.input[0] - mm_weight = n.input[1] - mm_output = n.output[0] - mm_in_shape = model.get_tensor_shape(mm_input) - mm_out_shape = model.get_tensor_shape(mm_output) - assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], ( - n.name - + """: First - input for xnorpopcount is not set to FINN DataType BINARY.""" - ) - assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], ( - n.name - + """: Second - input (weights) for xnorpopcount is not set to FINN DataType BINARY.""" - ) - idt = DataType["BINARY"] - wdt = DataType["BINARY"] - mm_output = n.output[0] - W = model.get_initializer(mm_weight) - # extract weight shape, note that ONNX and finn-hlslib - # make different assumptions about dim order here - # ONNX assumes W has (in, out) shape - # finn-hlslib assumes W has (out, in) shape - mh = int(W.shape[1]) - mw = int(W.shape[0]) + if node.op_type == "Add" or node.op_type == "Mul": + # assuming input[0] is dynamic + ll_input = node.input[0] + ll_output = node.output[0] + ll_in_shape = model.get_tensor_shape(ll_input) + + # check if input 1 has an initializer + ll_const = node.input[1] + if ll_const is not None: + ll_cinit = model.get_initializer(ll_const) + if ll_cinit is None: + # input 1 is also dynamic + continue + else: + continue + + # get number of channels and channel index from input + ll_in_layout = model.get_tensor_layout(ll_input) + if ll_in_layout == DataLayout.NHWC or ll_in_layout == DataLayout.NC: + ch_index = -1 + ch = ll_in_shape[-1] + elif ll_in_layout == DataLayout.NCHW: + ch_index = 1 + ch = ll_in_shape[1] + else: + continue + + # check if the shape of initializer is compatible + ll_cinit_shape = list(ll_cinit.shape) + if np.prod(ll_cinit_shape) == 1: + warnings.warn("Broadcasting " + str(node.op_type) + "(" + node.name + ")") + ll_cinit = np.full((ch), ll_cinit.flatten()[0]) + elif np.prod(ll_cinit_shape) != ch or ll_cinit_shape[ch_index] != ch: + # parameter shape not compatible with Channelwise + continue + + # check initializer contains integers as floats + if not (ll_cinit.astype(np.int32) == ll_cinit).all(): + continue + # all initializer conditions are met + + # check inputs + idt = model.get_tensor_datatype(ll_input) + if not idt.is_integer(): + # skip conversion for layers with float input + continue + + # check layout of inputs/outputs, and convert if needed + # check layout and convert if necessary + if ll_in_layout == DataLayout.NCHW: + ll_input = nchw_to_nhwc(ll_input, model, node_ind) + node_ind += 1 + ll_in_shape = model.get_tensor_shape(ll_input) + + # keep track of where we need to insert the HW Op + # it has to be ahead of the output transform + insert_point = node_ind + ll_output_layout = model.get_tensor_layout(ll_output) + if ll_output_layout == DataLayout.NCHW: + ll_output = nchw_to_nhwc(ll_output, model, node_ind, reverse=True) + node_ind += 1 + + # get parameter data type + param_min = min(ll_cinit.flatten()) + param_max = max(ll_cinit.flatten()) + pdt = self.get_smallest_possible([param_min, param_max]) + + # set function and determine output data type + if node.op_type == "Add": + func = "add" + out_min = idt.min() + param_min + out_max = idt.max() + param_max + odt = 
# create node with no parallelization first pe = 1 - simd = 1 - wmem = mw * mh // (pe * simd) - assert mw * mh == wmem * pe * simd, ( - n.name - + """: Requirement (MW * MH) divisiable by - (WMEM * PE * SIMD) is violated.""" + assert ch % pe == 0, "Requirement IFC divisible by PE is violated." + # create and insert node + new_node = helper.make_node( + "ChannelwiseOp", + [ll_input, ll_const], + [ll_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + Func=func, + NumChannels=ch, + PE=pe, + inputDataType=idt.name, + paramDataType=pdt.name, + outputDataType=odt.name, + numInputVectors=list(ll_in_shape[:-1]), + name="ChannelwiseOp_" + node.name, ) - # see if we have any following thresholds - consumer = model.find_consumer(mm_output) - if consumer is not None and consumer.op_type == "MultiThreshold": - # TODO ensure integer thresholds? - # create MVTU (i.e. including activation) - mt_output = consumer.output[0] - mt_out_shape = model.get_tensor_shape(mt_output) - mt_thres = consumer.input[1] - T = model.get_initializer(mt_thres) - assert T.shape[0] == 1 or T.shape[0] == mh, ( - consumer.name - + """: First dimension of - thresholds neither 1 nor MH.""" - ) - odt = model.get_tensor_datatype(mt_output) - if odt.bitwidth() == 1: - # covers both bipolar and binary - actval = 0 - else: - actval = odt.min() - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mt_output, mt_out_shape) - # create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight, mt_thres], - [mt_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=actval, - binaryXnorMode=1, - noActivation=0, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name=n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(n) - graph.node.remove(consumer) - graph_modified = True - else: - # no activation, matmul only - odt = model.get_tensor_datatype(mm_output) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mm_output, mm_out_shape) - # create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight], - [mm_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=0, - binaryXnorMode=1, - noActivation=1, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name=n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(n) - graph_modified = True + graph.node.insert(insert_point, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + if graph_modified: model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) -class 
InferQuantizedMatrixVectorActivation(Transformation): - """Convert MatMul layers with quantized inputs and weights to - MatrixVectorActivation layers. Any immediately following MultiThreshold - layers will also be absorbed into the MVTU.""" - - def __init__(self, mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode +class InferLabelSelectLayer(Transformation): + """Convert any TopK into a LabelSelect HW layer.""" def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False - for n in graph.node: + for node in graph.node: node_ind += 1 - if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None: - mm_input = n.input[0] - mm_weight = n.input[1] - mm_output = n.output[0] - mm_in_shape = model.get_tensor_shape(mm_input) - mm_out_shape = model.get_tensor_shape(mm_output) - idt = model.get_tensor_datatype(mm_input) - wdt = model.get_tensor_datatype(mm_weight) - if idt.is_integer() and wdt.is_integer(): - mm_output = n.output[0] - W = model.get_initializer(mm_weight) - # extract weight shape, note that ONNX and finn-hlslib - # make different assumptions about dim order here - # ONNX assumes W has (in, out) shape - # finn-hlslib assumes W has (out, in) shape - mh = int(W.shape[1]) - mw = int(W.shape[0]) - # create node with no parallelization first - pe = 1 - simd = 1 - wmem = mw * mh // (pe * simd) - assert mw * mh == wmem * pe * simd, ( - n.name - + """: Requirement (MW * MH) divisible by - (WMEM * PE * SIMD) is violated.""" - ) - # see if we have any following thresholds - consumer = model.find_consumer(mm_output) - if consumer is not None and consumer.op_type == "MultiThreshold": - # TODO ensure integer thresholds? - # create MVTU (i.e. including activation) - mt_output = consumer.output[0] - mt_out_shape = model.get_tensor_shape(mt_output) - mt_thres = consumer.input[1] - T = model.get_initializer(mt_thres) - assert T.shape[0] == 1 or T.shape[0] == mh, ( - consumer.name - + """: First dimension of - thresholds neither 1 nor MH.""" - ) - odt = model.get_tensor_datatype(mt_output) - scale = getCustomOp(consumer).get_nodeattr("out_scale") - actval = getCustomOp(consumer).get_nodeattr("out_bias") - assert int(actval) == actval, ( - consumer.name + ": out_bias must be integer for HLS conversion." - ) - actval = int(actval) - odt_is_bipolar = odt == DataType["BIPOLAR"] - bipolar_ok = odt_is_bipolar and (scale == 2.0) and (actval == -1) - assert scale == 1.0 or bipolar_ok, ( - consumer.name + ": out_scale=1 or bipolar output needed for conversion." 
- ) - assert (not odt.signed()) or (actval < 0), ( - consumer.name + ": Signed output requres actval < 0" - ) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mt_output, mt_out_shape) - if bipolar_ok: - # remove bias for bipolar, since - # binary->bipolar is achieved by reinterpretation - actval = 0 - # create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight, mt_thres], - [mt_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=actval, - binaryXnorMode=0, - noActivation=0, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name="MatrixVectorActivation_" + n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(n) - graph.node.remove(consumer) - graph_modified = True - else: - # no activation, matmul only - odt = model.get_tensor_datatype(mm_output) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mm_output, mm_out_shape) - # create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight], - [mm_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=0, - binaryXnorMode=0, - noActivation=1, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name="MatrixVectorActivation_" + n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(n) - graph_modified = True + if node.op_type == "TopK": + fc_input = node.input[0] + k_input = node.input[1] + val_output = node.output[0] + idx_output = node.output[1] + fc_in_shape = model.get_tensor_shape(fc_input) + + idt = model.get_tensor_datatype(fc_input) + + # skip conversion for layers with float input + if not idt.is_integer(): + continue + + # skip conversion if the value output is connected (not supported) + if model.find_consumer(val_output) is not None: + continue + + num_labels = int(fc_in_shape[-1]) + num_inp_vecs = list(fc_in_shape[:-1]) + # create node with no parallelization first + pe = 1 + + k = model.get_initializer(k_input)[0] + + # create and insert new LabelSelect node + new_node = helper.make_node( + "LabelSelect", + [fc_input], + [idx_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + Labels=num_labels, + PE=pe, + K=k, + inputDataType=idt.name, + numInputVectors=num_inp_vecs, + name="LabelSelect_" + node.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + if graph_modified: model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) -class InferVectorVectorActivation(Transformation): - """Convert MatMul layers with quantized inputs and weights to - VectorVectorActivation layers, if the sparsity annotation - of the weight matrix indicates that the MatMul layer belongs to - a depthwise convolution. 
Any immediately following MultiThreshold - layers will also be absorbed into the VVAU.""" - - def __init__(self, mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for n in graph.node: - node_ind += 1 - if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is not None: - sparsity = model.get_tensor_sparsity(n.input[1]) - try: - k_h, k_w = sparsity["dw"]["kernel_shape"] - except KeyError: - raise Exception( - n.name - + """: sparsity annotation doesn't indicate that MatMul - belongs to a depthwise convolution.""" - ) - - mm_input = n.input[0] - mm_weight = n.input[1] - mm_output = n.output[0] - mm_in_shape = model.get_tensor_shape(mm_input) - mm_out_shape = model.get_tensor_shape(mm_output) - idt = model.get_tensor_datatype(mm_input) - wdt = model.get_tensor_datatype(mm_weight) - if idt.is_integer() and wdt.is_integer(): - mm_output = n.output[0] - W = model.get_initializer(mm_weight) - # infer dense weight tensor from sparse weight matrix - # kernel size (k_h, k_w) which was extracted above and the value of - # the channels is used. - # the weight matrix has a shape of (k_h * k_w * Channels, Channels) - # we need to reverse the creation of the sparse weight matrix - # to achieve a weight tensor of shape (Channels, 1, k_h, k_w) - channels = int(W.shape[1]) - # transpose to achieve a shape of (k_h * k_w * Channels, Channels) - W = W.T - # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards - # to (Channels, Channels, k_h, k_w) - W = W.reshape(channels, k_h, k_w, channels) - W = W.transpose(0, 3, 1, 2) - # now we can extract the values using a for loop over the channels - # and fill a zero numpy array in the correct shape - w_tensor = np.zeros((channels, 1, k_h, k_w), dtype=np.float32) - for ch in range(channels): - w_tensor[ch][0] = W[ch][ch] - model.set_initializer(mm_weight, w_tensor) - model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w)) - # create node with pe=channels as default - pe = channels - # see if we have any following thresholds - consumer = model.find_consumer(mm_output) - if consumer is not None and consumer.op_type == "MultiThreshold": - # create VVAU (i.e. including activation) - mt_output = consumer.output[0] - mt_out_shape = model.get_tensor_shape(mt_output) - mt_thres = consumer.input[1] - T = model.get_initializer(mt_thres) - assert T.shape[0] == 1 or T.shape[0] == channels, ( - consumer.name - + """: First dimension of - thresholds neither 1 nor Channels.""" - ) - odt = model.get_tensor_datatype(mt_output) - scale = getCustomOp(consumer).get_nodeattr("out_scale") - assert scale == 1.0, ( - consumer.name + ": out_scale must be equal to 1.0 for HLS conversion." - ) - actval = getCustomOp(consumer).get_nodeattr("out_bias") - assert int(actval) == actval, ( - consumer.name + ": out_bias must be integer for HLS conversion." 
- ) - actval = int(actval) - assert (not odt.signed()) or (actval < 0), ( - consumer.name + ": Signed output requres actval < 0" - ) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mt_output, mt_out_shape) - # create and insert new VectorVectorActivation node - new_node = helper.make_node( - "VectorVectorActivation", - [mm_input, mm_weight, mt_thres], - [mt_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - resType="lut", - PE=pe, - Dim=[mm_in_shape[1], mm_in_shape[2]], - Channels=channels, - Kernel=[k_h, k_w], - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=actval, - noActivation=0, - name="VectorVectorActivation_" + n.name, - mem_mode=self.mem_mode, - ) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(n) - graph.node.remove(consumer) - graph_modified = True - else: - # no activation, matmul only - odt = model.get_tensor_datatype(mm_output) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mm_output, mm_out_shape) - # create and insert new VVAU node - new_node = helper.make_node( - "VectorVectorActivation", - [mm_input, mm_weight], - [mm_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - resType="lut", - PE=pe, - Dim=[mm_in_shape[1], mm_in_shape[2]], - Channels=channels, - Kernel=[k_h, k_w], - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=0, - noActivation=1, - name="VectorVectorActivation_" + n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(n) - graph_modified = True - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferThresholdingLayer(Transformation): - """Convert any MultiThreshold into a standalone thresholding HLS layer.""" - - def __init__(self, mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode +class InferGlobalAccPoolLayer(Transformation): + """Convert any GlobalAveragePool into a GlobalAccPool HW layer and a scalar Mul.""" def apply(self, model): graph = model.graph @@ -1029,72 +811,76 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "MultiThreshold": - thl_input = node.input[0] - thl_threshold = node.input[1] - thl_output = node.output[0] - thl_in_shape = model.get_tensor_shape(thl_input) - thl_thres_shape = model.get_tensor_shape(thl_threshold) - idt = model.get_tensor_datatype(thl_input) + if node.op_type == "GlobalAveragePool": + in0 = node.input[0] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + + idt = model.get_tensor_datatype(in0) # skip conversion for layers with float input if not idt.is_integer(): continue - # check layout of inputs/outputs, and convert if needed # check layout and convert if necessary - thl_in_layout = model.get_tensor_layout(thl_input) - if thl_in_layout == DataLayout.NCHW: - thl_input = nchw_to_nhwc(thl_input, model, node_ind) + in0_layout = model.get_tensor_layout(in0) + result_layout = model.get_tensor_layout(result) + + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) node_ind += 1 - thl_in_shape = model.get_tensor_shape(thl_input) + in0_shape = model.get_tensor_shape(in0) - # keep track of where we need to insert the HLS Op + # keep track of where we need to insert the HW Op # it has to be ahead of the output transform insert_point = node_ind - thl_output_layout = 
model.get_tensor_layout(thl_output) - if thl_output_layout == DataLayout.NCHW: - thl_output = nchw_to_nhwc(thl_output, model, node_ind, reverse=True) + + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, reverse=True) node_ind += 1 - # now safe to assume number of channels is in last dimension - ifc = int(thl_in_shape[-1]) + num_ch = int(in0_shape[-1]) + vecs = in0_shape[:-1] # create node with no parallelization first pe = 1 - odt = model.get_tensor_datatype(thl_output) - scale = getCustomOp(node).get_nodeattr("out_scale") - assert scale == 1.0, ( - node.name + ": MultiThreshold out_scale must be 1 for HLS conversion." - ) - actval = getCustomOp(node).get_nodeattr("out_bias") - assert int(actval) == actval, ( - node.name + ": MultiThreshold out_bias must be integer for HLS conversion." - ) - actval = int(actval) - assert (not odt.signed()) or (actval < 0), ( - node.name + ": Signed output requres actval < 0" + # create an additional tensor of the same shape and layout as result + out_shape = model.get_tensor_shape(result) + pool_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape ) - # create and insert new Thresholding_Batch node - new_node = helper.make_node( - "Thresholding_Batch", - [thl_input, thl_threshold], - [thl_output], + model.graph.value_info.append(pool_out) + pool_out = pool_out.name + model.set_tensor_layout(pool_out, model.get_tensor_layout(result)) + + new_pool = helper.make_node( + "GlobalAccPool", + [in0], + [pool_out], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - NumChannels=ifc, + NumChannels=num_ch, PE=pe, - numSteps=thl_thres_shape[1], inputDataType=idt.name, - # weightDataType can be tightened by MinimizeAccumulatorWidth - weightDataType=idt.name, - outputDataType=odt.name, - numInputVectors=list(thl_in_shape[:-1]), - ActVal=actval, - mem_mode=self.mem_mode, - name="Thresholding_Batch_" + node.name, + numInputVectors=vecs, + name="GlobalAccPool_" + node.name, ) - graph.node.insert(insert_point, new_node) + + mul_value = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, [1] + ) + model.graph.value_info.append(mul_value) + model.set_initializer( + mul_value.name, np.array(1 / (vecs[1] * vecs[2]), dtype=np.float32) + ) + new_mul = helper.make_node( + "Mul", + [pool_out, mul_value.name], + [result], + ) + graph.node.insert(insert_point, new_pool) + graph.node.insert(insert_point + 1, new_mul) + node_ind += 1 # remove old node graph.node.remove(node) graph_modified = True @@ -1105,8 +891,10 @@ def apply(self, model): return (model, graph_modified) -class InferAddStreamsLayer(Transformation): - """Convert any Add into a AddStreams HLS layer.""" +class InferPool(Transformation): + """If kernel_shape > strides, replace the Pool layer with a combination of + Im2col + pool (with kernel_shape == strides), plus Transpose layers to keep + the original data layout.""" def apply(self, model): graph = model.graph @@ -1114,78 +902,171 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "Add": - in0 = node.input[0] - in1 = node.input[1] - result = node.output[0] - in0_shape = model.get_tensor_shape(in0) - in1_shape = model.get_tensor_shape(in1) - in0_static = not (model.get_initializer(in0) is None) - in1_static = not (model.get_initializer(in1) is None) - - # skip if different shapes on inputs - if in0_shape != in1_shape: - continue - # skip if any of inputs have initializers - # (this node is meant for 
adding two dynamic streams) - if in0_static or in1_static: + if node.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]: + node_input = node.input[0] + ishape = model.get_tensor_shape(node_input) + node_output = node.output[0] + idt = model.get_tensor_datatype(node_input) + oshape = model.get_tensor_shape(node_output) + # only support 4D input tensors (1D convs need extra dummy dim) + if len(ishape) != 4: continue - idt0 = model.get_tensor_datatype(in0) - idt1 = model.get_tensor_datatype(in1) + # extract pool parameters + if node.op_type == "MaxPool": + kh, kw = list(get_by_name(node.attribute, "kernel_shape").ints) + sh, sw = list(get_by_name(node.attribute, "strides").ints) + dlayout = "NCHW" + elif node.op_type == "QuantAvgPool2d": + inst = getCustomOp(node) + # QuantAvgPool2d has a single scalar attribute + # for kernel size and stride (implicit square) + kh = kw = inst.get_nodeattr("kernel") + sh = sw = inst.get_nodeattr("stride") + dlayout = inst.get_nodeattr("data_layout") + elif node.op_type == "MaxPoolNHWC": + inst = getCustomOp(node) + kh, kw = inst.get_nodeattr("kernel_shape") + sh, sw = inst.get_nodeattr("strides") + dlayout = "NHWC" + try: + pad = list(get_by_name(node.attribute, "pads").ints) + except AttributeError: + pad = [0, 0, 0, 0] - # skip if different data types on inputs - if idt0 != idt1: + if not idt.is_integer(): continue - idt = idt0 - - # skip conversion for layers with float input - if not idt.is_integer(): + if (kh < sh) or (kw < sw): + # TODO check/implement swg support continue - # check layout and convert if necessary - in0_layout = model.get_tensor_layout(in0) - in1_layout = model.get_tensor_layout(in1) - result_layout = model.get_tensor_layout(result) + odt = model.get_tensor_datatype(node_output) - if in0_layout == DataLayout.NCHW: - in0 = nchw_to_nhwc(in0, model, node_ind) - node_ind += 1 - in0_shape = model.get_tensor_shape(in0) + if dlayout == "NCHW": + _, ifm_ch, ifm_h, ifm_w = ishape + _, ofm_ch, ofm_h, ofm_w = oshape + elif dlayout == "NHWC": + _, ifm_h, ifm_w, ifm_ch = ishape + _, ofm_h, ofm_w, ofm_ch = oshape + else: + raise Exception("Unknown dlayout: " + str(dlayout)) - if in1_layout == DataLayout.NCHW: - in1 = nchw_to_nhwc(in1, model, node_ind) - node_ind += 1 - in1_shape = model.get_tensor_shape(in1) + # if data layout NCHW, we need transpose nodes surrounding + # the hw layer + if dlayout == "NCHW": + # create new intermediate values + inp_trans_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ifm_h, ifm_w, ifm_ch), # NHWC + ) + graph.value_info.append(inp_trans_out) + inp_trans_out = inp_trans_out.name + model.set_tensor_datatype(inp_trans_out, idt) - # keep track of where we need to insert the HLS Op - # it has to be ahead of the output transform - insert_point = node_ind + pool_output = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_h, ofm_w, ofm_ch), + ) + graph.value_info.append(pool_output) + pool_output = pool_output.name - if result_layout == DataLayout.NCHW: - result = nchw_to_nhwc(result, model, node_ind, reverse=True) - node_ind += 1 + im2col_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_h, ofm_w, ifm_ch * kh * kw), + ) + graph.value_info.append(im2col_out) + im2col_out = im2col_out.name + model.set_tensor_datatype(im2col_out, idt) - # now safe to assume num_channels is size of last dimension - num_channels = int(in0_shape[-1]) - # create node with no parallelization 
first - pe = 1 + # create new nodes + if dlayout == "NCHW": + # NCHW -> NHWC + inp_trans_node = helper.make_node( + "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] + ) + im2col_in = inp_trans_out + else: + im2col_in = node_input + pool_output = node_output - # create and insert new AddStreams_Batch node - new_node = helper.make_node( - "AddStreams_Batch", - [in0, in1], - [result], + accum_bits = 0 + pool_size_param = 0 # will be overridden if needed + pad_value = 0 + if node.op_type in ["MaxPool", "MaxPoolNHWC"]: + pool_fxn = "MaxPool" + odt = idt + pad_value = idt.min() + elif node.op_type == "QuantAvgPool2d": + assert odt.is_integer(), """Output data type for QuantAvgPool2d + needs to be integer""" + assert all(x == 0 for x in pad), "Padding is not supported for QuantAvgPool2d" + inst = getCustomOp(node) + pool_fxn = "QuantAvgPool" + pool_size_param = inst.get_shifts() + accum_bits = inst.get_accum_size() + + else: + raise Exception( + "pad_value and pool_fxn not configured for {}".format(node.op_type) + ) + + # format input tensor + im2col_node = helper.make_node( + "Im2Col", + [im2col_in], + [im2col_out], + domain="qonnx.custom_op.general", + stride=[sh, sw], + kernel_size=[kh, kw], + pad_amount=pad, + pad_value=pad_value, + depthwise=1, + input_shape="(1,{},{},{})".format(ifm_h, ifm_w, ifm_ch), + name="Im2Col_" + node.name, + ) + + # Warning PE has to be equal to ifm_ch until Im2Col is replaced by + # ConvolutionInputGenerator with depthwise=1. + # For other settings the output will be incorrect due to incorrect input + # data layout + pool_node = helper.make_node( + "Pool", + [im2col_out], + [pool_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - NumChannels=num_channels, - PE=pe, - inputDataType=idt.name, - numInputVectors=in0_shape[:-1], - name="AddStreams_Batch_" + node.name, + InputDataType=idt.name, + OutputDataType=odt.name, + Channels=ifm_ch, + PE=ifm_ch, + KernelSize=[kh, kw], + Function=pool_fxn, + OutImgDims=[ofm_h, ofm_w], + AccumBits=accum_bits, + Size=pool_size_param, + BatchSize=1, + name="Pool_" + node.name, ) - graph.node.insert(insert_point, new_node) + + if dlayout == "NCHW": + # NHWC -> NCHW + out_trans_node = helper.make_node( + "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2] + ) + + # insert nodes where the conv is to preserve topological ordering + if dlayout == "NCHW": + graph.node.insert(node_ind, inp_trans_node) + graph.node.insert(node_ind + 1, im2col_node) + graph.node.insert(node_ind + 2, pool_node) + graph.node.insert(node_ind + 3, out_trans_node) + else: + graph.node.insert(node_ind, im2col_node) + graph.node.insert(node_ind + 1, pool_node) # remove old node graph.node.remove(node) graph_modified = True @@ -1196,8 +1077,8 @@ def apply(self, model): return (model, graph_modified) -class InferDuplicateStreamsLayer(Transformation): - """Insert a DuplicateStreams HLS layer for any tensor with fanout == 2""" +class InferLookupLayer(Transformation): + """Convert Gather nodes with constant op0 into Lookup HW layers.""" def apply(self, model): graph = model.graph @@ -1205,103 +1086,53 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - successors = model.find_consumers(node.output[0]) - if successors is not None and len(successors) >= 2: - output_tensor = node.output[0] - n_outputs = len(successors) - - dt = model.get_tensor_datatype(output_tensor) - - # skip conversion for layers with float input - if not dt.is_integer(): + if node.op_type == "Gather": + emb_name = 
node.input[0] + embs = model.get_initializer(emb_name) + axis = get_by_name(node.attribute, "axis") + # skip conversion if input0 is not constant + if embs is None: continue - - # create clone tensors - out_shape = model.get_tensor_shape(output_tensor) - out_tensor_clones = [] - for i in range(n_outputs): - clone = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(clone) - out_tensor_clones += [clone.name] - - num_ch = int(out_shape[-1]) - vecs = out_shape[:-1] - - # create node with no parallelization first - pe = 1 - - dup_node = helper.make_node( - "DuplicateStreams_Batch", - [output_tensor], - out_tensor_clones, + # skip conversion if axis != 0 + if axis is not None and axis.i != 0: + continue + ind_name = node.input[1] + ind_dtype = model.get_tensor_datatype(ind_name) + emb_dtype = model.get_tensor_datatype(emb_name) + # skip conversion if inputs are not unsigned integers + if (not ind_dtype.is_integer()) or ind_dtype.signed(): + continue + num_embs, emb_dim = embs.shape + out_name = node.output[0] + ishape = model.get_tensor_shape(node.input[1]) + # create and insert new Lookup node + new_node = helper.make_node( + "Lookup", + [ind_name, emb_name], + [out_name], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - NumChannels=num_ch, - PE=pe, - inputDataType=dt.name, - numInputVectors=vecs, - NumOutputStreams=n_outputs, - outFIFODepths=[2] * n_outputs, - name="DuplicateStreams_Batch_" + node.name, + name="Lookup_" + node.name, + NumEmbeddings=num_embs, + EmbeddingDim=emb_dim, + EmbeddingType=emb_dtype.name, + InputType=ind_dtype.name, + InputShape=list(ishape), ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True - graph.node.insert(node_ind, dup_node) + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) - # connect successors to out tensor clone - clone_idx = 0 - for successor in successors: - for i, succ_input in enumerate(successor.input): - if succ_input == output_tensor: - successor.input[i] = out_tensor_clones[clone_idx] - clone_idx += 1 - # if one node has multiple connections to the same output - # find_direct_successors will return one node per input - # so break the inner loop will result in correct behaviour - break - - graph_modified = True - - if graph_modified: - model = model.transform(SortGraph()) - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferChannelwiseLinearLayer(Transformation): - """Convert any channel-wise Add/Mul into a HLS layer.""" - - def get_smallest_possible(self, vals): - """Returns smallest (fewest bits) possible DataType that can represent - value. Prefers unsigned integers where possible.""" - vals = np.array(vals, dtype=np.float64) - for v in vals: - assert int(v) == v, "Error float value" - - for k in DataType.get_accumulator_dt_cands(): - dt = DataType[k] - - if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: - # not currently supported - continue - - if (dt.min() <= vals).all() and (vals <= dt.max()).all(): - return dt - warnings.warn( - """InferChannelwiseLinearLayer: Output values may not be - representable with supported data types. - Setting maximum width data type available. 
- This will lead to errors if there are no constrains on the input - """ - ) - - if (0 <= vals).all(): - return DataType["UINT64"] - else: - return DataType["INT64"] +class InferConcatLayer(Transformation): + """Convert suitable Concat nodes (operating on last/-1 axis) + into StreamingConcat HW layers.""" def apply(self, model): graph = model.graph @@ -1309,167 +1140,44 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "Add" or node.op_type == "Mul": - # assuming input[0] is dynamic - ll_input = node.input[0] - ll_output = node.output[0] - ll_in_shape = model.get_tensor_shape(ll_input) - - # check if input 1 has an initializer - ll_const = node.input[1] - if ll_const is not None: - ll_cinit = model.get_initializer(ll_const) - if ll_cinit is None: - # input 1 is also dynamic - continue - else: - continue - - # get number of channels and channel index from input - ll_in_layout = model.get_tensor_layout(ll_input) - if ll_in_layout == DataLayout.NHWC or ll_in_layout == DataLayout.NC: - ch_index = -1 - ch = ll_in_shape[-1] - elif ll_in_layout == DataLayout.NCHW: - ch_index = 1 - ch = ll_in_shape[1] - else: + if node.op_type == "Concat": + ishape = model.get_tensor_shape(node.input[0]) + axis = get_by_name(node.attribute, "axis") + if (axis is None) or (ishape is None): continue - - # check if the shape of initializer is compatible - ll_cinit_shape = list(ll_cinit.shape) - if np.prod(ll_cinit_shape) == 1: - warnings.warn("Broadcasting " + str(node.op_type) + "(" + node.name + ")") - ll_cinit = np.full((ch), ll_cinit.flatten()[0]) - elif np.prod(ll_cinit_shape) != ch or ll_cinit_shape[ch_index] != ch: - # parameter shape not compatible with Channelwise_batch + axis = axis.i + last_axis = len(ishape) - 1 + # skip conversion if not using last axis + if (axis != -1) and (axis != last_axis): continue - - # check initializer contains integers as floats - if not (ll_cinit.astype(np.int32) == ll_cinit).all(): + # check datatype coherence + dt0 = model.get_tensor_datatype(node.input[0]) + if dt0 is None: continue - # all initializer conditions are met - - # check inputs - idt = model.get_tensor_datatype(ll_input) - if not idt.is_integer(): - # skip conversion for layers with float input + dt_coherent = all([model.get_tensor_datatype(x) == dt0 for x in node.input]) + if not dt_coherent: continue - - # check layout of inputs/outputs, and convert if needed - # check layout and convert if necessary - if ll_in_layout == DataLayout.NCHW: - ll_input = nchw_to_nhwc(ll_input, model, node_ind) - node_ind += 1 - ll_in_shape = model.get_tensor_shape(ll_input) - - # keep track of where we need to insert the HLS Op - # it has to be ahead of the output transform - insert_point = node_ind - ll_output_layout = model.get_tensor_layout(ll_output) - if ll_output_layout == DataLayout.NCHW: - ll_output = nchw_to_nhwc(ll_output, model, node_ind, reverse=True) - node_ind += 1 - - # get parameter data type - param_min = min(ll_cinit.flatten()) - param_max = max(ll_cinit.flatten()) - pdt = self.get_smallest_possible([param_min, param_max]) - - # set function and determine output data type - if node.op_type == "Add": - func = "add" - out_min = idt.min() + param_min - out_max = idt.max() + param_max - odt = self.get_smallest_possible([out_min, out_max]) - elif node.op_type == "Mul": - func = "mul" - possible_limits = [] - possible_limits += [idt.min() * param_min] - possible_limits += [idt.min() * param_max] - possible_limits += [idt.max() * param_min] - possible_limits += 
[idt.max() * param_max] - odt = self.get_smallest_possible(possible_limits) - - model.set_initializer(ll_const, ll_cinit.reshape(ch)) - model.set_tensor_datatype(ll_output, odt) - - # create node with no parallelization first - pe = 1 - assert ch % pe == 0, "Requirement IFC divisable by PE is violated." - # create and insert node - new_node = helper.make_node( - "ChannelwiseOp_Batch", - [ll_input, ll_const], - [ll_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - Func=func, - NumChannels=ch, - PE=pe, - inputDataType=idt.name, - paramDataType=pdt.name, - outputDataType=odt.name, - numInputVectors=list(ll_in_shape[:-1]), - name="ChannelwiseOp_Batch_" + node.name, - ) - graph.node.insert(insert_point, new_node) - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferLabelSelectLayer(Transformation): - """Convert any TopK into a LabelSelect HLS layer.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "TopK": - fc_input = node.input[0] - k_input = node.input[1] - val_output = node.output[0] - idx_output = node.output[1] - fc_in_shape = model.get_tensor_shape(fc_input) - - idt = model.get_tensor_datatype(fc_input) - - # skip conversion for layers with float input - if not idt.is_integer(): + # skip conversion if any inputs are static + all_dynamic = all([model.get_initializer(x) is None for x in node.input]) + if not all_dynamic: continue - - # skip conversion for if value output is connected (not supported) - if model.find_consumer(val_output) is not None: + # skip conversion if inputs are not integers + if not dt0.is_integer(): continue - - num_labels = int(fc_in_shape[-1]) - num_inp_vecs = list(fc_in_shape[:-1]) - # create node with no parallelization first - pe = 1 - - k = model.get_initializer(k_input)[0] - - # create and insert new LabelSelect_Batch node + # ready for conversion + elems_per_stream = [model.get_tensor_shape(x)[-1] for x in node.input] + inp_vec = list(model.get_tensor_shape(node.input[0])[:-1]) new_node = helper.make_node( - "LabelSelect_Batch", - [fc_input], - [idx_output], + "StreamingConcat", + node.input, + node.output, domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - Labels=num_labels, - PE=pe, - K=k, - inputDataType=idt.name, - numInputVectors=num_inp_vecs, - name="LabelSelect_Batch_" + node.name, + name="Concat_" + node.name, + ElemsPerStream=elems_per_stream, + inputDataType=dt0.name, + numInputVectors=inp_vec, + inFIFODepths=[2] * len(node.input), ) graph.node.insert(node_ind, new_node) # remove old node @@ -1482,8 +1190,9 @@ def apply(self, model): return (model, graph_modified) -class InferGlobalAccPoolLayer(Transformation): - """Convert any GlobalAveragePool into a GlobalAccPool HLS layer and a scalar Mul.""" +class InferStreamingEltwise(Transformation): + """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer + with SubEltwise or AbsDiffEltwise op.""" def apply(self, model): graph = model.graph @@ -1491,245 +1200,42 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "GlobalAveragePool": + if node.op_type == "Sub": in0 = node.input[0] + in1 = node.input[1] result = node.output[0] in0_shape = model.get_tensor_shape(in0) + in1_shape = model.get_tensor_shape(in1) + in0_static = not 
(model.get_initializer(in0) is None) + in1_static = not (model.get_initializer(in1) is None) - idt = model.get_tensor_datatype(in0) + # skip if different shapes on inputs + if in0_shape != in1_shape: + continue + # skip if any of inputs have initializers + # (this node is meant for two dynamic streams) + if in0_static or in1_static: + continue + + idt0 = model.get_tensor_datatype(in0) + idt1 = model.get_tensor_datatype(in1) # skip conversion for layers with float input - if not idt.is_integer(): + if not (idt0.is_integer() and idt1.is_integer()): continue + eltwiseOp = "Sub" + nodes_to_remove = [node] + # look for a downstream Abs node + res_consumer = model.find_consumer(result) + if (res_consumer is not None) and (res_consumer.op_type == "Abs"): + eltwiseOp = "AbsDiff" + result = res_consumer.output[0] + nodes_to_remove.append(res_consumer) + # check layout and convert if necessary in0_layout = model.get_tensor_layout(in0) - result_layout = model.get_tensor_layout(result) - - if in0_layout == DataLayout.NCHW: - in0 = nchw_to_nhwc(in0, model, node_ind) - node_ind += 1 - in0_shape = model.get_tensor_shape(in0) - - # keep track of where we need to insert the HLS Op - # it has to be ahead of the output transform - insert_point = node_ind - - if result_layout == DataLayout.NCHW: - result = nchw_to_nhwc(result, model, node_ind, reverse=True) - node_ind += 1 - - num_ch = int(in0_shape[-1]) - vecs = in0_shape[:-1] - # create node with no parallelization first - pe = 1 - - # create an additional tensor of the same shape and layout as result - out_shape = model.get_tensor_shape(result) - pool_out = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(pool_out) - pool_out = pool_out.name - model.set_tensor_layout(pool_out, model.get_tensor_layout(result)) - - new_pool = helper.make_node( - "GlobalAccPool_Batch", - [in0], - [pool_out], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=num_ch, - PE=pe, - inputDataType=idt.name, - numInputVectors=vecs, - name="GlobalAccPool_Batch_" + node.name, - ) - - mul_value = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, [1] - ) - model.graph.value_info.append(mul_value) - model.set_initializer( - mul_value.name, np.array(1 / (vecs[1] * vecs[2]), dtype=np.float32) - ) - new_mul = helper.make_node( - "Mul", - [pool_out, mul_value.name], - [result], - ) - graph.node.insert(insert_point, new_pool) - graph.node.insert(insert_point + 1, new_mul) - node_ind += 1 - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferLookupLayer(Transformation): - """Convert Gather nodes with constant op0 into Lookup HLS layers.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "Gather": - emb_name = node.input[0] - embs = model.get_initializer(emb_name) - axis = get_by_name(node.attribute, "axis") - # skip conversion if input0 is not constant - if embs is None: - continue - # skip conversion if axis != 0 - if axis is not None and axis.i != 0: - continue - ind_name = node.input[1] - ind_dtype = model.get_tensor_datatype(ind_name) - emb_dtype = model.get_tensor_datatype(emb_name) - # skip conversion if inputs are not unsigned integers - if (not ind_dtype.is_integer()) or 
ind_dtype.signed(): - continue - num_embs, emb_dim = embs.shape - out_name = node.output[0] - ishape = model.get_tensor_shape(node.input[1]) - # create and insert new Lookup node - new_node = helper.make_node( - "Lookup", - [ind_name, emb_name], - [out_name], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - name="Lookup_" + node.name, - NumEmbeddings=num_embs, - EmbeddingDim=emb_dim, - EmbeddingType=emb_dtype.name, - InputType=ind_dtype.name, - InputShape=list(ishape), - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferConcatLayer(Transformation): - """Convert suitable Concat nodes (operating on last/-1 axis) - into StreamingConcat HLS layers.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "Concat": - ishape = model.get_tensor_shape(node.input[0]) - axis = get_by_name(node.attribute, "axis") - if (axis is None) or (ishape is None): - continue - axis = axis.i - last_axis = len(ishape) - 1 - # skip conversion if not using last axis - if (axis != -1) and (axis != last_axis): - continue - # check datatype coherence - dt0 = model.get_tensor_datatype(node.input[0]) - if dt0 is None: - continue - dt_coherent = all([model.get_tensor_datatype(x) == dt0 for x in node.input]) - if not dt_coherent: - continue - # skip conversion if any inputs are static - all_static = all([model.get_initializer(x) is None for x in node.input]) - if not all_static: - continue - # skip conversion if inputs are not integers - if not dt0.is_integer(): - continue - # ready for conversion - elems_per_stream = [model.get_tensor_shape(x)[-1] for x in node.input] - inp_vec = list(model.get_tensor_shape(node.input[0])[:-1]) - new_node = helper.make_node( - "StreamingConcat", - node.input, - node.output, - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - name="Concat_" + node.name, - ElemsPerStream=elems_per_stream, - inputDataType=dt0.name, - numInputVectors=inp_vec, - inFIFODepths=[2] * len(node.input), - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferStreamingEltwise(Transformation): - """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer - with SubEltwise or AbsDiffEltwise op.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "Sub": - in0 = node.input[0] - in1 = node.input[1] - result = node.output[0] - in0_shape = model.get_tensor_shape(in0) - in1_shape = model.get_tensor_shape(in1) - in0_static = not (model.get_initializer(in0) is None) - in1_static = not (model.get_initializer(in1) is None) - - # skip if different shapes on inputs - if in0_shape != in1_shape: - continue - # skip if any of inputs have initializers - # (this node is meant for two dynamic streams) - if in0_static or in1_static: - continue - - idt0 = model.get_tensor_datatype(in0) - idt1 = model.get_tensor_datatype(in1) - - # skip conversion for layers with float input - if not (idt0.is_integer() and idt1.is_integer()): - continue - - eltwiseOp = "Sub" - nodes_to_remove 
= [node] - # look for a downstream Abs node - res_consumer = model.find_consumer(result) - if (res_consumer is not None) and (res_consumer.op_type == "Abs"): - eltwiseOp = "AbsDiff" - result = res_consumer.output[0] - nodes_to_remove.append(res_consumer) - - # check layout and convert if necessary - in0_layout = model.get_tensor_layout(in0) - in1_layout = model.get_tensor_layout(in1) + in1_layout = model.get_tensor_layout(in1) result_layout = model.get_tensor_layout(result) if in0_layout == DataLayout.NCHW: @@ -1742,7 +1248,7 @@ def apply(self, model): node_ind += 1 in1_shape = model.get_tensor_shape(in1) - # keep track of where we need to insert the HLS Op + # keep track of where we need to insert the HW Op # it has to be ahead of the output transform insert_point = node_ind @@ -1776,7 +1282,412 @@ def apply(self, model): graph.node.remove(nd) graph_modified = True - # if graph_modified: - # model = model.transform(InferShapes()) - # model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferBinaryMatrixVectorActivation(Transformation): + """Convert XnorPopcountMatMul layers to + MatrixVectorActivation layers. Any immediately following MultiThreshold + layers will also be absorbed into the MVTU.""" + + def __init__(self): + super().__init__() + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "XnorPopcountMatMul": + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], ( + n.name + + """: First + input for xnorpopcount is not set to FINN DataType BINARY.""" + ) + assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], ( + n.name + + """: Second + input (weights) for xnorpopcount is not set to FINN DataType BINARY.""" + ) + idt = DataType["BINARY"] + wdt = DataType["BINARY"] + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # extract weight shape, note that ONNX and finn-hlslib + # make different assumptions about dim order here + # ONNX assumes W has (in, out) shape + # finn-hlslib assumes W has (out, in) shape + mh = int(W.shape[1]) + mw = int(W.shape[0]) + # create node with no parallelization first + pe = 1 + simd = 1 + wmem = mw * mh // (pe * simd) + assert mw * mh == wmem * pe * simd, ( + n.name + + """: Requirement (MW * MH) divisible by + (WMEM * PE * SIMD) is violated.""" + )
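+ # note: since XnorPopcountMatMul implies BINARY inputs and weights, the + # MVAU created below runs in binaryXnorMode; a directly following + # MultiThreshold (if any) is absorbed as its activation, otherwise the + # node is created with noActivation=1.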
+ # see if we have any following thresholds + consumer = model.find_consumer(mm_output) + if consumer is not None and consumer.op_type == "MultiThreshold": + # TODO ensure integer thresholds? + # create MVTU (i.e. including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == mh, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor MH.""" + ) + odt = model.get_tensor_datatype(mt_output) + if odt.bitwidth() == 1: + # covers both bipolar and binary + actval = 0 + else: + actval = odt.min() + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MVAU", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=1, + noActivation=0, + numInputVectors=list(mm_in_shape[:-1]), + name=n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MVAU", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + binaryXnorMode=1, + noActivation=1, + numInputVectors=list(mm_in_shape[:-1]), + name=n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferQuantizedMatrixVectorActivation(Transformation): + """Convert MatMul layers with quantized inputs and weights to + MatrixVectorActivation layers. Any immediately following MultiThreshold + layers will also be absorbed into the MVAU.""" + + def __init__(self): + super().__init__() + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None: + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + idt = model.get_tensor_datatype(mm_input) + wdt = model.get_tensor_datatype(mm_weight) + if idt.is_integer() and wdt.is_integer(): + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # extract weight shape, note that ONNX and finn-hlslib + # make different assumptions about dim order here + # ONNX assumes W has (in, out) shape + # finn-hlslib assumes W has (out, in) shape + mh = int(W.shape[1]) + mw = int(W.shape[0]) + # create node with no parallelization first + pe = 1 + simd = 1 + wmem = mw * mh // (pe * simd) + assert mw * mh == wmem * pe * simd, ( + n.name + + """: Requirement (MW * MH) divisible by + (WMEM * PE * SIMD) is violated.""" + ) + # see if we have any following thresholds + consumer = model.find_consumer(mm_output)
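+ # note: the absorption below requires the MultiThreshold to have an + # integer out_bias and out_scale == 1.0, with one exception: bipolar + # output with out_scale == 2.0 and out_bias == -1, where binary->bipolar + # is achieved by reinterpretation (see the asserts that follow).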
+ if consumer is not None and consumer.op_type == "MultiThreshold": + # TODO ensure integer thresholds? + # create MVTU (i.e. including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == mh, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor MH.""" + ) + odt = model.get_tensor_datatype(mt_output) + scale = getCustomOp(consumer).get_nodeattr("out_scale") + actval = getCustomOp(consumer).get_nodeattr("out_bias") + assert int(actval) == actval, ( + consumer.name + ": out_bias must be integer for HLS conversion." + ) + actval = int(actval) + odt_is_bipolar = odt == DataType["BIPOLAR"] + bipolar_ok = odt_is_bipolar and (scale == 2.0) and (actval == -1) + assert scale == 1.0 or bipolar_ok, ( + consumer.name + ": out_scale=1 or bipolar output needed for conversion." + ) + assert (not odt.signed()) or (actval < 0), ( + consumer.name + ": Signed output requires actval < 0" + ) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + if bipolar_ok: + # remove bias for bipolar, since + # binary->bipolar is achieved by reinterpretation + actval = 0 + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MVAU", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=0, + noActivation=0, + numInputVectors=list(mm_in_shape[:-1]), + name="MVAU_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MVAU", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + binaryXnorMode=0, + noActivation=1, + numInputVectors=list(mm_in_shape[:-1]), + name="MVAU_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferVectorVectorActivation(Transformation): + """Convert MatMul layers with quantized inputs and weights to + VectorVectorActivation layers, if the sparsity annotation + of the weight matrix indicates that the MatMul layer belongs to + a depthwise convolution. 
Any immediately following MultiThreshold + layers will also be absorbed into the VVAU.""" + + def __init__(self): + super().__init__() + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is not None: + sparsity = model.get_tensor_sparsity(n.input[1]) + try: + k_h, k_w = sparsity["dw"]["kernel_shape"] + except KeyError: + raise Exception( + n.name + + """: sparsity annotation doesn't indicate that MatMul + belongs to a depthwise convolution.""" + ) + + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + idt = model.get_tensor_datatype(mm_input) + wdt = model.get_tensor_datatype(mm_weight) + if idt.is_integer() and wdt.is_integer(): + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # infer the dense weight tensor from the sparse weight matrix, using + # the kernel size (k_h, k_w) extracted above and the number of channels. + # the weight matrix has a shape of (k_h * k_w * Channels, Channels) + # we need to reverse the creation of the sparse weight matrix + # to achieve a weight tensor of shape (Channels, 1, k_h, k_w) + channels = int(W.shape[1]) + # transpose to achieve a shape of (Channels, k_h * k_w * Channels) + W = W.T + # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards + # to (Channels, Channels, k_h, k_w) + W = W.reshape(channels, k_h, k_w, channels) + W = W.transpose(0, 3, 1, 2) + # now we can extract the values using a for loop over the channels + # and fill a zero numpy array in the correct shape + w_tensor = np.zeros((channels, 1, k_h, k_w), dtype=np.float32) + for ch in range(channels): + w_tensor[ch][0] = W[ch][ch] + model.set_initializer(mm_weight, w_tensor) + model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w)) + # create node with pe=channels as default + pe = channels + # see if we have any following thresholds + consumer = model.find_consumer(mm_output) + if consumer is not None and consumer.op_type == "MultiThreshold": + # create VVAU (i.e. including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == channels, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor Channels.""" + ) + odt = model.get_tensor_datatype(mt_output) + scale = getCustomOp(consumer).get_nodeattr("out_scale") + assert scale == 1.0, ( + consumer.name + ": out_scale must be equal to 1.0 for HLS conversion." + ) + actval = getCustomOp(consumer).get_nodeattr("out_bias") + assert int(actval) == actval, ( + consumer.name + ": out_bias must be integer for HLS conversion." + ) + actval = int(actval)
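+ # note: actval (the integer out_bias of the absorbed MultiThreshold) + # becomes the ActVal attribute of the VVAU created below, i.e. the bias + # applied to the thresholding activation output.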
+ assert (not odt.signed()) or (actval < 0), ( + consumer.name + ": Signed output requires actval < 0" + ) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + # create and insert new VectorVectorActivation node + new_node = helper.make_node( + "VVAU", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + PE=pe, + Dim=[mm_in_shape[1], mm_in_shape[2]], + Channels=channels, + Kernel=[k_h, k_w], + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + noActivation=0, + name="VVAU_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new VVAU node + new_node = helper.make_node( + "VVAU", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + PE=pe, + Dim=[mm_in_shape[1], mm_in_shape[2]], + Channels=channels, + Kernel=[k_h, k_w], + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + noActivation=1, + name="VVAU_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py index 07d6961be3..f34c6b90af 100644 --- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py +++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py @@ -52,7 +52,7 @@ def __init__(self, partition_model_dir=None): def apply(self, model): def filter_fc_extw(x): - if x.op_type == "IODMA": + if x.op_type == "IODMA_hls": burst_mode = get_by_name(x.attribute, "burstMode") if burst_mode is not None: burst_mode = burst_mode.s.decode("UTF-8") diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 9a653fe404..4212e2b58a 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -40,7 +41,7 @@ ReplaceVerilogRelPaths, ) from finn.util.basic import make_build_dir -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def is_external_input(model, node, i): @@ -48,12 +49,13 @@ def is_external_input(model, node, i): # True only if input is unconnected and has no initializer # Only esception is second input of FC layers when mem_mode is external node_inst = getCustomOp(node) + op_type = node.op_type producer = model.find_producer(node.input[i]) if producer is None: if model.get_initializer(node.input[i]) is None: return True else: - if node.op_type == "MatrixVectorActivation": + if op_type.startswith("MVAU"): if node_inst.get_nodeattr("mem_mode") == "external": return True return False @@ -284,14 +286,14 @@ def apply(self, model): ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream") if self.signature: ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/axi_info") - if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]: + if model.graph.node[0].op_type not in ["StreamingFIFO_rtl", "IODMA_hls"]: warnings.warn( """First node is not StreamingFIFO or IODMA. You may experience incorrect stitched-IP rtlsim or hardware behavior. It is strongly recommended to insert FIFOs prior to calling CreateStitchedIP.""" ) - if model.graph.node[0].op_type == "StreamingFIFO": + if model.graph.node[0].op_type == "StreamingFIFO_rtl": firstfifo = getCustomOp(model.graph.node[0]) if firstfifo.get_nodeattr("impl_style") == "vivado": warnings.warn( @@ -301,7 +303,9 @@ def apply(self, model): ) for node in model.graph.node: # ensure that all nodes are fpgadataflow, and that IPs are generated - assert is_fpgadataflow_node(node), "All nodes must be FINN fpgadataflow nodes." + assert is_hls_node(node) or is_rtl_node( + node + ), "All nodes must be FINN fpgadataflow nodes." node_inst = getCustomOp(node) ip_dir_value = node_inst.get_nodeattr("ip_path") assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist." @@ -348,7 +352,7 @@ def apply(self, model): if self.signature: # extract number of checksum layer from graph - checksum_layers = model.get_nodes_by_op_type("checksum") + checksum_layers = model.get_nodes_by_op_type("CheckSum_hls") self.insert_signature(len(checksum_layers)) # create a temporary folder for the project diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py index dc660f5fba..4d3ac7dc67 100644 --- a/src/finn/transformation/fpgadataflow/derive_characteristic.py +++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py @@ -1,4 +1,5 @@ -# Copyright (c) 2022, Xilinx +# Copyright (C) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
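A recurring pattern in this patch is the replacement of `is_fpgadataflow_node` with the pair `is_hls_node` / `is_rtl_node`. As a simplified sketch of what these predicates check (the real helpers live in `finn.util.fpgadataflow`; after specialization each node's domain encodes its implementation style):

```python
# simplified stand-ins for the predicates imported above; assumed to
# match the domain convention introduced by SpecializeLayers
def is_hls_node(node):
    # specialized HLS layers live in the ...fpgadataflow.hls domain
    return node.domain == "finn.custom_op.fpgadataflow.hls"

def is_rtl_node(node):
    # specialized RTL layers live in the ...fpgadataflow.rtl domain
    return node.domain == "finn.custom_op.fpgadataflow.rtl"
```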
# # Redistribution and use in source and binary forms, with or without @@ -32,7 +33,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.base import NodeLocalTransformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node class DeriveCharacteristic(NodeLocalTransformation): @@ -58,7 +59,7 @@ def __init__(self, period, num_workers=None, manual_bypass=False): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) @@ -74,7 +75,7 @@ def apply(self, model: ModelWrapper): return (model, run_again) # apply manual fix for DuplicateStreams and AddStreams for # simple residual reconvergent paths with bypass - addstrm_nodes = model.get_nodes_by_op_type("AddStreams_Batch") + addstrm_nodes = model.get_nodes_by_op_type("AddStreams_hls") for addstrm_node in addstrm_nodes: # we currently only support the case where one branch is # a bypass @@ -83,8 +84,8 @@ def apply(self, model: ModelWrapper): if (b0 is None) or (b1 is None): warnings.warn("Found unsupported AddStreams, skipping") return (model, run_again) - b0_is_bypass = b0.op_type == "DuplicateStreams_Batch" - b1_is_bypass = b1.op_type == "DuplicateStreams_Batch" + b0_is_bypass = b0.op_type == "DuplicateStreams_hls" + b1_is_bypass = b1.op_type == "DuplicateStreams_hls" if (not b0_is_bypass) and (not b1_is_bypass): warnings.warn("Found unsupported AddStreams, skipping") return (model, run_again) @@ -130,11 +131,11 @@ def __init__(self, num_workers=None, io_fifo_depth=32): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps prod = registry.getCustomOp(node) - assert op_type != "StreamingFIFO", "Found existing FIFOs" + assert not (op_type.startswith("StreamingFIFO")), "Found existing FIFOs" period = prod.get_nodeattr("io_chrc_period") prod_chrc = prod.get_nodeattr("io_chrc_out")[0] assert len(prod_chrc) == 2 * period, "Found unexpected characterization attribute" diff --git a/src/finn/transformation/fpgadataflow/externalize_params.py b/src/finn/transformation/fpgadataflow/externalize_params.py index 633db0c553..5e21d8cb2a 100644 --- a/src/finn/transformation/fpgadataflow/externalize_params.py +++ b/src/finn/transformation/fpgadataflow/externalize_params.py @@ -42,7 +42,7 @@ def apply(self, model): graph_modified = False def filter_fc_extw(x): - if x.op_type == "IODMA": + if x.op_type == "IODMA_hls": burst_mode = get_by_name(x.attribute, "burstMode") if burst_mode is not None: burst_mode = burst_mode.s.decode("UTF-8") diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 336b3f80d0..b24145afcb 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -81,7 +81,7 @@ def apply(self, model): if node_slr == -1: unassigned_nodes += 1 node_inst.set_nodeattr("slr", default_slr) - if node.op_type == "StreamingDataWidthConverter_Batch": + if node.op_type.startswith("StreamingDataWidthConverter"): # if we have SLR assignment already. 
use that if node_slr != -1: continue @@ -95,7 +95,7 @@ def apply(self, model): narrow_neighbour = model.find_producer(node.input[0]) node_slr = getCustomOp(narrow_neighbour).get_nodeattr("slr") node_inst.set_nodeattr("slr", node_slr) - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): # if we have SLR assignment already. use that if node_slr != -1: continue @@ -119,11 +119,11 @@ def apply(self, model): df_nodes = list( filter(lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes) ) - dma_nodes = list(filter(lambda x: x.op_type == "IODMA", df_nodes)) + dma_nodes = list(filter(lambda x: x.op_type == "IODMA_hls", df_nodes)) non_dma_nodes = list(filter(lambda x: x not in dma_nodes, df_nodes)) dyn_tlastmarker_nodes = list( filter( - lambda x: x.op_type == "TLastMarker" + lambda x: x.op_type == "TLastMarker_hls" and getCustomOp(x).get_nodeattr("DynIters") == "true", non_dma_nodes, ) @@ -150,7 +150,7 @@ def apply(self, model): continue elif not ( - node.op_type == "MatrixVectorActivation" + node.op_type.startswith("MVAU") and node_inst.get_nodeattr("mem_mode") is not None and node_inst.get_nodeattr("mem_mode") == "external" ): diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py index 08069fa00f..5b901d9284 100644 --- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py +++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,18 +32,18 @@ import warnings from qonnx.transformation.base import NodeLocalTransformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node class HLSSynthIP(NodeLocalTransformation): - """For each node: generate IP block from code in folder + """For each HLS node: generate IP block from code in folder that is referenced in node attribute "code_gen_dir_ipgen" and save path of generated project in node attribute "ipgen_path". All nodes in the graph must have the fpgadataflow backend attribute. Any nodes that already have a ipgen_path attribute pointing to a valid path will be skipped. - This transformation calls Vivado HLS for synthesis, so it will run for + This transformation calls Vitis HLS for synthesis, so it will run for some time (minutes to hours depending on configuration). * num_workers (int or None) number of parallel workers, see documentation in @@ -54,7 +55,7 @@ def __init__(self, num_workers=None): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index bf0254c1a7..33cc3e86d3 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -1,4 +1,31 @@ -import warnings +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from onnx import TensorProto from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp @@ -8,19 +35,16 @@ def _is_dwc_node(node): - if node.op_type == "StreamingDataWidthConverter_Batch": - return True - else: - return False + return node.op_type.startswith("StreamingDataWidthConverter") def _suitable_node(node): if node is not None: - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): if _is_dwc_node(node): # no DWC for DWCs return False - elif node.op_type == "IODMA": + elif node.op_type == "IODMA_hls": # IODMA data shapes/widths need special handling return False else: @@ -34,9 +58,8 @@ def _suitable_node(node): class InsertDWC(Transformation): """Add data width converters between layers where necessary.""" - def __init__(self, use_rtl_variant=True): + def __init__(self): super().__init__() - self.use_rtl_variant = use_rtl_variant def apply(self, model): graph = model.graph @@ -50,7 +73,7 @@ def apply(self, model): if consumers == []: continue assert len(consumers) == 1, ( - n.name + ": HLS node with fan-out higher than 1 cannot be stitched" + n.name + ": HW node with fan-out higher than 1 cannot be stitched" ) consumer = consumers[0] if _suitable_node(consumer) is True: @@ -62,9 +85,9 @@ def apply(self, model): # - if FC and external mem, it could be connected to input 1 # - if concat, could be connected to any input if ( - consumer.op_type == "MatrixVectorActivation" + consumer.op_type.startswith("MVAU") and n1.get_nodeattr("mem_mode") == "external" - ) or (consumer.op_type == "StreamingConcat"): + ) or (consumer.op_type.startswith("StreamingConcat")): # get input idx in_idx = None for idx, n_input in enumerate(consumer.input): @@ -82,20 +105,7 @@ def apply(self, model): dwc_in_width = n0.get_outstream_width() # determine dwc outwidth dwc_out_width = n1.get_instream_width() - if self.use_rtl_variant: - # check if rtl variant can be used - iwidth_d = dwc_in_width % dwc_out_width == 0 - owidth_d = dwc_out_width % dwc_in_width == 0 - if iwidth_d or owidth_d: - node_optype = "StreamingDataWidthConverter_rtl" - else: - warnings.warn( - "DWC cannot be implemented as RTL variant, default to hls" - ) - node_optype = "StreamingDataWidthConverter_Batch" - self.use_rtl_variant = False - else: - node_optype = "StreamingDataWidthConverter_Batch" + node_optype = 
"StreamingDataWidthConverter" # determine shape for dwc dwc_shape = n0.get_normal_output_shape() @@ -121,15 +131,6 @@ def apply(self, model): outWidth=dwc_out_width, dataType=str(dtype.name), ) - # if not rtl variant is selected - # use hls mode by default since it supports more configs - # vivado mode can be manually enabled by user, but does not - # support e.g. node-by-node rtlsim neded for - # characterization-based FIFO sizing - if not self.use_rtl_variant: - impl_attr = oh.make_attribute("impl_style", "hls") - dwc_node.attribute.append(impl_attr) - # insert dwc graph.node.insert(node_ind + 1, dwc_node) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index f57c9e41b7..9df193efcf 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +38,7 @@ def _is_fifo_node(node): - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): return True else: return False @@ -45,8 +46,8 @@ def _is_fifo_node(node): def _suitable_node(node): if node is not None: - if is_fpgadataflow_node(node) is True: - if _is_fifo_node(node) is False: + if is_fpgadataflow_node(node): + if not _is_fifo_node(node): return True else: return False @@ -127,6 +128,7 @@ def apply(self, model): folded output shape of the first node is not the same as the folded output shape of the second node. A streaming fifo can't be implemented in between these nodes.""" + n_shape = n0.get_normal_output_shape() # check if outFIFOdepths attribute of first node # and inFIFOdepths attribute of consumer node is equal @@ -162,6 +164,7 @@ def apply(self, model): backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=str(dtype.name), impl_style=impl_style, ram_style=self.vivado_ram_style, @@ -182,12 +185,16 @@ def apply(self, model): for graph_in_name in graph_in_names: first_node = model.find_consumer(graph_in_name) # insert FIFO as first node, except when first node is DMA - if first_node.op_type != "StreamingFIFO" and first_node.op_type != "IODMA": + if ( + not first_node.op_type.startswith("StreamingFIFO") + and first_node.op_type != "IODMA_hls" + ): inp_ind = list(first_node.input).index(graph_in_name) n_input = first_node.input[inp_ind] n0 = getCustomOp(first_node) # determine fifo node attributes fld_shape = n0.get_folded_input_shape(inp_ind) + n_shape = n0.get_normal_input_shape(inp_ind) dtype = n0.get_input_datatype(inp_ind) fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind] @@ -196,7 +203,7 @@ def apply(self, model): fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, - n0.get_normal_input_shape(), + n0.get_normal_input_shape(inp_ind), ) graph.value_info.append(fifo_output_tensor) model.set_tensor_datatype(fifo_output_tensor.name, dtype) @@ -213,6 +220,7 @@ def apply(self, model): backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=str(dtype.name), impl_style=impl_style, ram_style=self.vivado_ram_style, @@ -234,15 +242,19 @@ def apply(self, model): graph_out_names = [x.name for x in model.graph.output] for graph_out_name in graph_out_names: final_node = model.find_producer(graph_out_name) - if 
final_node.op_type != "StreamingFIFO" and final_node.op_type != "IODMA": + if ( + not final_node.op_type.startswith("StreamingFIFO") + and final_node.op_type != "IODMA_hls" + ): assert ( - final_node.op_type != "TLastMarker" + final_node.op_type != "TLastMarker_hls" ), """Insert tlast marker should be done after inserting the FIFOs""" n0 = getCustomOp(final_node) out_ind = list(final_node.output).index(graph_out_name) # determine fifo node attributes fld_shape = n0.get_folded_output_shape(out_ind) + n_shape = n0.get_normal_output_shape(out_ind) dtype = n0.get_output_datatype(out_ind) fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind] @@ -268,6 +280,7 @@ def apply(self, model): backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=str(dtype.name), impl_style=impl_style, ram_style=self.vivado_ram_style, diff --git a/src/finn/transformation/fpgadataflow/insert_hook.py b/src/finn/transformation/fpgadataflow/insert_hook.py index 14989efa75..843a32a73e 100644 --- a/src/finn/transformation/fpgadataflow/insert_hook.py +++ b/src/finn/transformation/fpgadataflow/insert_hook.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -33,11 +34,11 @@ from qonnx.transformation.base import Transformation from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def _is_hook_node(node): - if node.op_type in ["CheckSum"]: + if node.op_type in ["CheckSum_hls"]: return True else: return False @@ -45,8 +46,8 @@ def _is_hook_node(node): def _suitable_node(node): if node is not None: - if is_fpgadataflow_node(node) is True: - if _is_hook_node(node) is False: + if is_hls_node(node) or is_rtl_node(node): + if not _is_hook_node(node): return True else: return False @@ -81,7 +82,7 @@ def apply(self, model): if n0_hook in list_supported_hooks: if n0_hook == "checksum": if len(consumers) == 1: - if consumers[0].op_type == "CheckSum": + if consumers[0].op_type == "CheckSum_hls": continue n0_normal_oshape = n0.get_normal_output_shape() n0_folded_oshape = n0.get_folded_output_shape() @@ -99,10 +100,10 @@ def apply(self, model): [1], ) chk_node = oh.make_node( - "CheckSum", + "CheckSum_hls", [output_name], outputs=[chk_otensor.name, chk_result.name], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", words_per_frame=words_per_frame, items_per_word=items_per_word, diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 90700d5726..91d4ab1559 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -106,7 +106,7 @@ def apply(self, model): graph_in_names = [x.name for x in model.graph.input] for graph_in_name in graph_in_names: first_node = model.find_consumer(graph_in_name) - if first_node.op_type == "IODMA": + if first_node.op_type == "IODMA_hls": # IODMA already inserted for this input continue else: @@ -134,7 +134,7 @@ def apply(self, model): # padding problems for i/o DMA first_node.input[0] = first_node_in.name dma_node = oh.make_node( - "IODMA", + "IODMA_hls", [graph_in_name], [first_node_in.name], numInputVectors=in_folded_shape[:-1], @@ -143,7 +143,7 @@ 
def apply(self, model): intfWidth=intfwidth, streamWidth=padded_instream_width, direction="in", - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.insert(0, dma_node) @@ -153,7 +153,7 @@ def apply(self, model): graph_out_names = [x.name for x in model.graph.output] for graph_out_name in graph_out_names: final_node = model.find_producer(graph_out_name) - if final_node.op_type == "IODMA": + if final_node.op_type == "IODMA_hls": continue else: out_shape = model.get_tensor_shape(graph_out_name) @@ -180,7 +180,7 @@ def apply(self, model): # FIXME: currently always using 8-bit dtypes to work around the # padding problems for i/o DMA dma_node = oh.make_node( - "IODMA", + "IODMA_hls", [final_node_out.name], [graph_out_name], numInputVectors=out_folded_shape[:-1], @@ -189,7 +189,7 @@ def apply(self, model): intfWidth=intfwidth, streamWidth=padded_outstream_width, direction="out", - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.append(dma_node) @@ -199,7 +199,7 @@ def apply(self, model): # attached IODMA fc_extw_nodes = list( filter( - lambda x: x.op_type in ["MatrixVectorActivation", "VectorVectorActivation"] + lambda x: x.op_type in ["MVAU_hls", "MVAU_rtl", "VVAU_hls", "VVAU_rtl"] and getCustomOp(x).get_nodeattr("mem_mode") == "external" and model.find_producer(x.input[1]) is None, all_nodes, @@ -230,7 +230,7 @@ def apply(self, model): model.set_tensor_datatype(fc_node_in.name, w_dtype) model.set_initializer(fc_node_in.name, W) dma_node = oh.make_node( - "IODMA", + "IODMA_hls", [fc_w_name], [fc_node_in.name], numInputVectors=[iodma_mem.shape[0]], @@ -240,7 +240,7 @@ def apply(self, model): streamWidth=streamWidth, direction="in", burstMode="wrap", - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) fc_node.input[1] = fc_node_in.name diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 94f0b0eae1..2131100dcf 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -35,7 +35,7 @@ class InsertTLastMarker(Transformation): - """Ensure that the graph is started/terminated with a TLastMarker node, inserting + """Ensure that the graph is started/terminated with a TLastMarker_hls node, inserting one if necessary. Use constructor args to determine type of TLastMarker to be inserted. More information available on the TLastMarker documentation. 
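As a usage sketch of the intended ordering (file name and constructor keywords are assumptions, not taken from this patch): IODMA and FIFO insertion come first, TLastMarker insertion last, matching the assertion in the InsertFIFO hunk above.

```python
from qonnx.core.modelwrapper import ModelWrapper

from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA
from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker

model = ModelWrapper("dataflow_model.onnx")  # hypothetical input model
model = model.transform(InsertIODMA(64))     # DMA engines on graph I/O first
# TLastMarker must be inserted after FIFOs/IODMAs; kwargs assumed
model = model.transform(InsertTLastMarker(external=True, dyniters=True))
```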
@@ -52,8 +52,8 @@ def apply(self, model): graph_out_name = model.graph.output[0].name final_node = model.find_producer(graph_out_name) graph_modified = False - if final_node.op_type != "TLastMarker" and not ( - final_node.op_type == "IODMA" + if final_node.op_type != "TLastMarker_hls" and not ( + final_node.op_type == "IODMA_hls" and get_by_name(final_node.attribute, "direction").s.decode("UTF-8") == "out" ): custom_op = getCustomOp(final_node) @@ -71,7 +71,7 @@ def apply(self, model): # reroute final node output to final_node_out_name final_node.output[0] = final_node_out.name tlast_node = oh.make_node( - "TLastMarker", + "TLastMarker_hls", [final_node_out.name], [graph_out_name], NumIters=num_iters, @@ -80,7 +80,7 @@ def apply(self, model): DynIters=(1 if self.dyniters else 0), Direction="out", Protocol=("external" if self.external else "internal"), - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.append(tlast_node) @@ -103,23 +103,23 @@ def apply(self, model): # the input is in the list of graph inputs because it has an # initializer (TODO: fix this with a clean-up transform) if ( - first_node.op_type == "MatrixVectorActivation" + first_node.op_type.startswith("MVAU") and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8") != "external" ): continue # 2. node is either a TLastMarker or an input IODMA - if first_node.op_type != "TLastMarker" and not ( - first_node.op_type == "IODMA" + if first_node.op_type != "TLastMarker_hls" and not ( + first_node.op_type == "IODMA_hls" and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") == "in" ): custom_op = getCustomOp(first_node) num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) inp_idx = list(first_node.input).index(graph_in_name) if inp_idx > 0: - if first_node.op_type == "MatrixVectorActivation" and inp_idx == 1: + if first_node.op_type.startswith("MVAU") and inp_idx == 1: stream_width = int(custom_op.get_weightstream_width()) - elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1: + elif first_node.op_type.startswith("AddStreams") and inp_idx == 1: stream_width = int(custom_op.get_instream_width()) else: raise Exception("No method to determine stream width") @@ -141,7 +141,7 @@ def apply(self, model): # reroute final node output to first_node_in_name first_node.input[inp_idx] = first_node_in.name tlast_node = oh.make_node( - "TLastMarker", + "TLastMarker_hls", [graph_in_name], [first_node_in.name], NumIters=num_iters, @@ -150,7 +150,7 @@ def apply(self, model): DynIters=(1 if self.dyniters else 0), Direction="in", Protocol=("external" if self.external else "internal"), - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.insert(insert_idx, tlast_node) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 6d1fa290b4..ea9bd2aa26 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -146,7 +146,7 @@ def apply(self, model): Ensure CreateDataflowPartition called before driver creation.""" first_df_model = ModelWrapper(getCustomOp(i_consumer).get_nodeattr("model")) assert ( - first_df_model.graph.node[0].op_type == "IODMA" + first_df_model.graph.node[0].op_type == "IODMA_hls" ), "First partition must hold input IODMA" successors = model.find_direct_successors(i_consumer) successor_input_num = 
list(successors[0].input).index(i_consumer.output[0]) @@ -187,7 +187,9 @@ def apply(self, model): ), """ Ensure CreateDataflowPartition called before driver creation.""" df_model = ModelWrapper(getCustomOp(o_producer).get_nodeattr("model")) - assert df_model.graph.node[-1].op_type == "IODMA", "Partition must hold output IODMA" + assert ( + df_model.graph.node[-1].op_type == "IODMA_hls" + ), "Partition must hold output IODMA" predecessors = model.find_direct_predecessors(o_producer) predecessor_output_num = list(predecessors[0].output).index(o_producer.input[0]) predecessor_sdp = getCustomOp(predecessors[0]) @@ -231,7 +233,7 @@ def apply(self, model): sdp_inst = getCustomOp(node) idma_name = sdp_inst.get_nodeattr("instance_name") df_model = ModelWrapper(sdp_inst.get_nodeattr("model")) - assert df_model.graph.node[0].op_type == "IODMA" + assert df_model.graph.node[0].op_type == "IODMA_hls" iodma_node = getCustomOp(df_model.graph.node[0]) if iodma_node.get_nodeattr("burstMode") == "wrap": # input weights dma? init_tensor = df_model.get_initializer(iodma_node.onnx_node.input[0]) @@ -280,7 +282,7 @@ def apply(self, model): dataflow_model = ModelWrapper(dataflow_model_filename) rt_layer_ind = 0 for node in dataflow_model.graph.node: - if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]: + if node.op_type.startswith("MVAU") or node.op_type.startswith("Thresholding"): node_inst = getCustomOp(node) is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights") if is_rt_weights == 1: diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 989eb62a88..fc2047b08e 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,6 +46,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map from . 
import templates @@ -62,8 +64,8 @@ def collect_ip_dirs(model, ipstitch_path): ), """The directory that should contain the generated ip blocks doesn't exist.""" ip_dirs += [ip_dir_value] - if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]: - if node_inst.get_nodeattr("mem_mode") == "decoupled": + if node.op_type.startswith("MVAU") or node.op_type == "Thresholding_hls": + if node_inst.get_nodeattr("mem_mode") == "internal_decoupled": need_memstreamer = True ip_dirs += [ipstitch_path + "/ip"] if need_memstreamer: @@ -320,6 +322,7 @@ def apply(self, model): prep_transforms = [ InsertIODMA(self.axi_port_width), InsertDWC(), + SpecializeLayers(), Floorplan(), CreateDataflowPartition(partition_model_dir=self.partition_model_dir), ] @@ -335,6 +338,7 @@ def apply(self, model): dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) + kernel_model = kernel_model.transform(SpecializeLayers()) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) kernel_model = kernel_model.transform(PrepareIP(self.fpga_part, self.period_ns)) diff --git a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py index 8d04d5b817..61159fde0c 100644 --- a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py +++ b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,7 +47,7 @@ def apply(self, model): # Since InferDataTypes potentially changes node attributes in each loop iterations, # the for-loop cannot loop over a list of a snapshot of the graph's node protos node = model.graph.node[node_id] - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): inst = getCustomOp(node) if hasattr(inst, "minimize_accumulator_width"): inst.minimize_accumulator_width(model) diff --git a/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py b/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py index 32871cc44a..49770f7d0c 100644 --- a/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py +++ b/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -42,7 +42,7 @@ def __init__(self): def apply(self, model): for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): inst = getCustomOp(node) if hasattr(inst, "minimize_weight_bit_width"): inst.minimize_weight_bit_width(model) diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py index 76c3f88310..d4cc6dcc99 100644 --- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
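The MinimizeAccumulatorWidth hunk above only changes the node filter, but the idea behind the pass can be illustrated with a back-of-the-envelope bound. This sketch assumes signed weights and unsigned activations; the real pass inspects the actual weight values rather than worst-case ranges.

```python
import math

def worst_case_acc_bits(mw, weight_bits, act_bits):
    # largest possible |accumulator| for mw products of a signed
    # weight_bits weight and an unsigned act_bits activation
    max_w = 2 ** (weight_bits - 1)
    max_a = 2 ** act_bits - 1
    bound = mw * max_w * max_a
    return math.ceil(math.log2(bound + 1)) + 1  # +1 sign bit

print(worst_case_acc_bits(576, 4, 4))  # e.g. a 3x3 kernel over 64 channels -> 18
```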
# # Redistribution and use in source and binary forms, with or without @@ -34,7 +35,7 @@ from qonnx.util.basic import get_num_default_workers from finn.util.basic import make_build_dir -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node def _codegen_single_node(node, model): @@ -78,7 +79,7 @@ def __init__(self, num_workers=None): self._num_workers = mp.cpu_count() def prepareCppSim_node(self, node): - if is_fpgadataflow_node(node) is True: + if is_hls_node(node): _codegen_single_node(node, self.model) return (node, False) diff --git a/src/finn/transformation/fpgadataflow/prepare_ip.py b/src/finn/transformation/fpgadataflow/prepare_ip.py index 5461bbd77c..a74e0f7afc 100644 --- a/src/finn/transformation/fpgadataflow/prepare_ip.py +++ b/src/finn/transformation/fpgadataflow/prepare_ip.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -32,7 +33,7 @@ from qonnx.transformation.base import Transformation from finn.util.basic import make_build_dir -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def _codegen_single_node(node, model, fpgapart, clk): @@ -72,8 +73,15 @@ class PrepareIP(Transformation): will be skipped. Outcome if succesful: Node attribute "code_gen_dir_ipgen" contains path to folder - that contains generated C++ code that can be used to generate a Vivado IP block. - The subsequent transformation is HLSSynthIP""" + that contains: + + * For HLS layers: generated C++ code that can be used to generate a Vivado IP block. + The necessary subsequent transformation is HLSSynthIP. + + * For RTL layers: filled template verilog files that can be used to instantiate as + module during IP stitching. + + """ def __init__(self, fpgapart, clk): super().__init__() @@ -82,6 +90,6 @@ def __init__(self, fpgapart, clk): def apply(self, model): for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): _codegen_single_node(node, model, self.fpgapart, self.clk) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py index 8ba7cfd965..b8f45deb1d 100644 --- a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
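Putting the PrepareIP docstring above into practice, a typical IP-generation sequence after specialization looks like the following (the part number and clock period are placeholders):

```python
from qonnx.core.modelwrapper import ModelWrapper

from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP

model = ModelWrapper("specialized_model.onnx")  # hypothetical input
model = model.transform(PrepareIP("xc7z020clg400-1", 10.0))  # fpgapart, clk in ns
# only *_hls nodes go through Vitis HLS; *_rtl nodes already received
# their filled-in Verilog templates from PrepareIP
model = model.transform(HLSSynthIP())
```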
#
# Redistribution and use in source and binary forms, with or without
@@ -32,7 +33,7 @@
 from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
 ReplaceVerilogRelPaths,
 )
-from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.util.fpgadataflow import is_hls_node, is_rtl_node
 try:
 from pyverilator import PyVerilator
@@ -63,7 +64,7 @@ def apply(self, model):
 def applyNodeLocal(self, node):
 op_type = node.op_type
- if is_fpgadataflow_node(node) is True:
+ if is_hls_node(node) or is_rtl_node(node):
 try:
 # lookup op_type in registry of CustomOps
 inst = registry.getCustomOp(node)
diff --git a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
index 4e7970caa0..de13166e73 100644
--- a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
+++ b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2020, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
 # All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -30,7 +31,7 @@
 import qonnx.custom_op.registry as registry
 from qonnx.transformation.base import Transformation
-from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.util.fpgadataflow import is_hls_node, is_rtl_node
 class ReplaceVerilogRelPaths(Transformation):
@@ -41,7 +42,7 @@ def __init__(self):
 def apply(self, model):
 for node in model.graph.node:
- if is_fpgadataflow_node(node) is True:
+ if is_hls_node(node) or is_rtl_node(node):
 try:
 # lookup op_type in registry of CustomOps
 inst = registry.getCustomOp(node)
diff --git a/src/finn/transformation/fpgadataflow/set_exec_mode.py b/src/finn/transformation/fpgadataflow/set_exec_mode.py
index 8488b4ef83..405ddb0c42 100644
--- a/src/finn/transformation/fpgadataflow/set_exec_mode.py
+++ b/src/finn/transformation/fpgadataflow/set_exec_mode.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2020, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
 # All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -29,12 +30,15 @@
 import qonnx.custom_op.registry as registry
 from qonnx.transformation.base import Transformation
-from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.util.fpgadataflow import is_hls_node, is_rtl_node
 class SetExecMode(Transformation):
 """Set attribute exec_mode in all fpgadataflow nodes to specify which
- kind of execution should be used ("cppsim" or "rtlsim")"""
+ kind of execution should be used ("cppsim" or "rtlsim").
+ Note that RTL components do not support cppsim; when cppsim is selected
+ for an RTL component, the Python-mode execution of its parent HW op is
+ used instead."""
 def __init__(self, mode):
 super().__init__()
@@ -43,7 +47,7 @@ def __init__(self, mode):
 def apply(self, model):
 for node in model.graph.node:
 op_type = node.op_type
- if is_fpgadataflow_node(node) is True:
+ if is_hls_node(node) or is_rtl_node(node):
 try:
 # lookup op_type in registry of CustomOps
 inst = registry.getCustomOp(node)
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 72b5e495a4..82ee536d50 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2020, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -47,7 +48,8 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.fpgadataflow import is_hls_node, is_rtl_node from finn.util.pyverilator import pyverilate_stitched_ip, verilator_fifosim @@ -101,7 +103,7 @@ def apply(self, model): else: is_first_node = True if ( - node.op_type == "StreamingFIFO" + node.op_type.startswith("StreamingFIFO") and getCustomOp(node).get_nodeattr("depth") <= self.shallow_threshold and (not is_first_node) ): @@ -165,16 +167,16 @@ def apply(self, model): for node in model.graph.node: # look for following pattern: # ConvolutionInputGenerator -> StreamingFIFO -> MatrixVectorActivation - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): fifo_prod = model.find_producer(node.input[0]) fifo_cons = model.find_consumer(node.output[0]) if fifo_prod is None: continue - if fifo_prod.op_type != "ConvolutionInputGenerator": + if not fifo_prod.op_type.startswith("ConvolutionInputGenerator"): continue if fifo_cons is None: continue - if fifo_cons.op_type != "MatrixVectorActivation": + if not fifo_cons.op_type.startswith("MVAU"): continue op_inst = getCustomOp(node) depth = op_inst.get_nodeattr("depth") @@ -257,14 +259,17 @@ def __init__( def apply(self, model): # these optypes may potentially use external weights # we'll temporarily change them to use decoupled mode for FIFO sizing - extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"] + extw_optypes = ["MVAU_hls", "MVAU_rtl", "VVAU_hls", "VVAU_rtl"] # change external to decoupled and warn user # this way we are sure we have exactly one input/output modified_fc_nodes = [] for node in model.graph.node: # verify assumptions - assert is_fpgadataflow_node(node), "Found non-fpgadataflow node: " + str(node) - assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node" + assert is_hls_node(node) or is_rtl_node(node), "Found non-fpgadataflow node: " + str( + node + ) + op_type = node.op_type + assert not op_type.startswith("StreamingFIFO"), "Found existing StreamingFIFO node" node = getCustomOp(node) ifd = node.get_nodeattr("inFIFODepths") ofd = node.get_nodeattr("outFIFODepths") @@ -280,26 +285,27 @@ def apply(self, model): ofd[o] = np.prod(node.get_folded_output_shape(o)[:-1]) node.set_nodeattr("inFIFODepths", ifd) node.set_nodeattr("outFIFODepths", ofd) - if node.onnx_node.op_type in extw_optypes: mmode = node.get_nodeattr("mem_mode") if mmode == "external": modified_fc_nodes.append(node.onnx_node.name) - node.set_nodeattr("mem_mode", "decoupled") + node.set_nodeattr("mem_mode", "internal_decoupled") reset_implementation(node) warnings.warn( - "Changed mem_mode from external to decoupled for " + node.onnx_node.name + "Changed mem_mode from external to internal_decoupled for " + + node.onnx_node.name ) # insert stream infrastructure (DWC/FIFO) model = model.transform(InsertDWC()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers(self.fpgapart)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) # gather FIFO names, check they are 
of expected depth fifos = {} - fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO_rtl") for node in fifo_nodes: fifos[node.name] = 0 node = getCustomOp(node) @@ -373,7 +379,9 @@ def apply(self, model): else: # do rtlsim in C++ for FIFO sizing # determine # inputs for FIFO sizing according to topology type - swg_nodes = [x for x in model.graph.node if "ConvolutionInputGenerator" in x.op_type] + swg_nodes = [ + x for x in model.graph.node if x.op_type.startswith("ConvolutionInputGenerator") + ] if len(swg_nodes) == 0: # MLP, no layer overlap # assuming half the nodes are now FIFOs, use half the # of @@ -397,7 +405,7 @@ def apply(self, model): for node in model.graph.node: # set FIFO depth, reset FIFO implementation, # and set implementation/ram styles - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): assert node.name in fifos, "FIFO node not found in size dictionary" # set depth of FIFO depth = optimize_depth(fifos[node.name]) @@ -441,7 +449,7 @@ def apply(self, model): # reflect final values in attributes for node in model.graph.node: - if node.op_type != "StreamingFIFO": + if not node.op_type.startswith("StreamingFIFO"): node_inst = getCustomOp(node) fifodepth_in = [] for node_inp in node.input: @@ -456,7 +464,7 @@ def apply(self, model): pass else: # there is a producer for this input - if prod.op_type == "StreamingFIFO": + if prod.op_type.startswith("StreamingFIFO"): prod_inst = getCustomOp(prod) fifodepth_in.append(prod_inst.get_nodeattr("depth")) else: @@ -475,7 +483,7 @@ def apply(self, model): pass else: # there is a consumer for this input - if cons.op_type == "StreamingFIFO": + if cons.op_type.startswith("StreamingFIFO"): cons_inst = getCustomOp(cons) fifodepth_out.append(cons_inst.get_nodeattr("depth")) else: @@ -562,12 +570,13 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "StreamingFIFO": + if node.op_type == ("StreamingFIFO_rtl"): n_inst = getCustomOp(node) depth = n_inst.get_nodeattr("depth") cfgs = get_fifo_split_configs(depth, self.max_qsrl_depth, self.max_vivado_depth) if len(cfgs) > 1: fld_shape = n_inst.get_folded_output_shape() + n_shape = n_inst.get_normal_output_shape() dtype = n_inst.get_nodeattr("dataType") ram_style = n_inst.get_nodeattr("ram_style") shape = model.get_tensor_shape(node.input[0]) @@ -586,13 +595,14 @@ def apply(self, model): graph.value_info.append(out_tensor) model.set_tensor_datatype(out_tensor.name, DataType[dtype]) fifo_node = helper.make_node( - "StreamingFIFO", + "StreamingFIFO_rtl", [inp], [outp], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.rtl", backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=dtype, impl_style=impl_style, ram_style=ram_style, diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 4045a28e16..eaee499e6a 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
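For context on the depth values assigned in the set_fifo_depths hunks above, here is an illustrative stand-in for the `optimize_depth` helper. The exact thresholds live in set_fifo_depths.py and may differ; the sketch only conveys the rounding strategy.

```python
def optimize_depth_sketch(depth):
    # shallow FIFOs map well to SRL-based queues
    if depth <= 2:
        return 2
    if depth <= 32:
        return 32
    # deeper FIFOs: round up to a power of two for efficient RAM addressing
    return 1 << (depth - 1).bit_length()

assert optimize_depth_sketch(20) == 32
assert optimize_depth_sketch(600) == 1024
```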
# # Redistribution and use in source and binary forms, with or without @@ -34,7 +35,7 @@ from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def divisors(num): @@ -99,33 +100,33 @@ def apply(self, model): graph = model.graph # these ops use PE parallelism, up to a max value of NumChannels pe_ops = [ - "AddStreams_Batch", - "ChannelwiseOp_Batch", - "DuplicateStreams_Batch", - "GlobalAccPool_Batch", - "Thresholding_Batch", + "AddStreams_hls", + "ChannelwiseOp_hls", + "DuplicateStreams_hls", + "GlobalAccPool_hls", + "Thresholding_hls", + "Thresholding_rtl", ] # these ops use SIMD parallelism, up to a max value of NumChannels # ConvolutionInputGenerator* has a special case when depthwise=1 # ConvolutionInputGenerator_rtl supports additional parallelism by # setting parallel_window=1 mode after maxing out SIMD simd_ops = [ - "DownSampler", - "FMPadding_Batch", - "FMPadding_Pixel", - "ConvolutionInputGenerator", - "ConvolutionInputGenerator1D", + "DownSampler_hls", + "FMPadding_hls", + "FMPadding_Pixel_hls", + "ConvolutionInputGenerator_hls", "ConvolutionInputGenerator_rtl", ] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring - depthwise_op_exceptions = ["VectorVectorActivation", "Pool_Batch"] + depthwise_op_exceptions = ["VVAU_hls", "VVAU_rtl", "Pool_hls"] for node in graph.node: - if not is_fpgadataflow_node(node): + if not (is_hls_node(node) or is_rtl_node(node)): continue op_type = node.op_type node_inst = getCustomOp(node) - if op_type == "MatrixVectorActivation": + if op_type in ["MVAU_hls", "MVAU_rtl"]: max_simd = node_inst.get_nodeattr("MW") max_pe = node_inst.get_nodeattr("MH") node_inst.set_nodeattr("PE", 1) @@ -152,12 +153,12 @@ def apply(self, model): elif op_type in pe_ops: max_pe = node_inst.get_nodeattr("NumChannels") self.optimize_attribute_val(node_inst, max_pe, "PE") - elif op_type == "LabelSelect_Batch": + elif op_type == "LabelSelect_hls": max_pe = node_inst.get_nodeattr("Labels") self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type in depthwise_op_exceptions: # init/reset SIMD of VVAU - if op_type == "VectorVectorActivation": + if op_type in ["VVAU_hls", "VVAU_rtl"]: node_inst.set_nodeattr("SIMD", 1) max_pe = node_inst.get_nodeattr("Channels") self.optimize_attribute_val(node_inst, max_pe, "PE") @@ -165,7 +166,7 @@ def apply(self, model): pe = node_inst.get_nodeattr("PE") cyc = node_inst.get_exp_cycles() if ( - op_type == "VectorVectorActivation" + op_type in ["VVAU_hls", "VVAU_rtl"] and pe == max_pe and cyc > self.target_cycles_per_frame ): @@ -179,17 +180,14 @@ def apply(self, model): swu_node_inst.set_nodeattr("SIMD", pe) # enable parallel_window mode of RTL SWG if needed if swu_node.op_type == "ConvolutionInputGenerator_rtl": - if ( - op_type == "VectorVectorActivation" - and node_inst.get_nodeattr("SIMD") > 1 - ): + if op_type.startswith("VVAU") and node_inst.get_nodeattr("SIMD") > 1: swu_node_inst.set_nodeattr("parallel_window", 1) else: swu_node_inst.set_nodeattr("parallel_window", 0) else: - if op_type == "VectorVectorActivation": + if op_type in ["VVAU_hls", "VVAU_rtl"]: ksize = np.prod(node_inst.get_nodeattr("Kernel")) - elif op_type == "Pool_Batch": + elif op_type == "Pool_hls": ksize = node_inst.get_nodeattr("KernelSize") else: raise Exception("Undefined edge case for 
%s" % op_type) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py new file mode 100644 index 0000000000..e71d6c23a4 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -0,0 +1,323 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+import warnings
+from onnx import helper
+from qonnx.core.datatype import DataType
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.base import Transformation
+
+from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants
+from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants
+from finn.util.fpgadataflow import is_versal
+
+
+def _determine_impl_style(node, fpgapart):
+ optype = node.op_type
+
+ # check if there is an HLS or RTL variant or both
+ hls_variant = optype + "_hls" in hls_variants.keys()
+ rtl_variant = optype + "_rtl" in rtl_variants.keys()
+
+ # check if user has specified a preferred_impl_style
+ inst = getCustomOp(node)
+ impl_style = inst.get_nodeattr("preferred_impl_style")
+
+ # if impl_style not set, for "simple" layers always try
+ # to use rtl variant if available
+ if impl_style == "":
+ if optype == "StreamingDataWidthConverter":
+ return _dwc_determine_impl_style(node)
+ if rtl_variant:
+ if optype == "MVAU":
+ inp_width_fit = (
+ DataType[getCustomOp(node).get_nodeattr("inputDataType")].bitwidth() >= 4
+ )
+ weight_width_fit = (
+ DataType[getCustomOp(node).get_nodeattr("weightDataType")].bitwidth() >= 4
+ )
+ if inp_width_fit and weight_width_fit and _mvu_rtl_possible(node):
+ return "rtl"
+ else:
+ return "hls"
+ elif optype == "VVAU":
+ inp_width_fit = (
+ DataType[getCustomOp(node).get_nodeattr("inputDataType")].bitwidth() >= 4
+ )
+ weight_width_fit = (
+ DataType[getCustomOp(node).get_nodeattr("weightDataType")].bitwidth() >= 4
+ )
+ if inp_width_fit and weight_width_fit and _vvu_rtl_possible(node, fpgapart):
+ return "rtl"
+ else:
+ return "hls"
+ return "rtl"
+ # but if no rtl variant, set impl_style to hls
+ elif hls_variant:
+ return "hls"
+ # if there is neither an rtl nor hls variant
+ # throw error
+ else:
+ raise Exception(
+ """Node {} with optype {} has no hw implementation variant""".format(
+ node.name, optype
+ )
+ )
+
+ # check if user setting can be fulfilled
+ # otherwise change impl_style
+ elif impl_style == "hls":
+ if optype == "ConvolutionInputGenerator":
+ if not _swg_hls_possible(node):
+ warn_str = (
+ """Settings are not supported in HLS. Node %s will automatically be
+ set to RTL variant."""
+ % node.name
+ )
+ warnings.warn(warn_str)
+ return "rtl"
+ else:
+ return "hls"
+
+ if hls_variant:
+ return "hls"
+ elif rtl_variant:
+ warn_str = """There is no HLS variant of %s. Node %s will automatically be
+ set to RTL variant.""" % (
+ node.op_type,
+ node.name,
+ )
+ warnings.warn(warn_str)
+ return "rtl"
+ else:
+ raise Exception(
+ """Node {} with optype {} has no hw implementation variant""".format(
+ node.name, optype
+ )
+ )
+ elif impl_style == "rtl":
+ # rtl dwc does not support every inWidth to outWidth ratio
+ if optype == "StreamingDataWidthConverter":
+ if _dwc_determine_impl_style(node) != "rtl":
+ warn_str = """RTL implementation of DWC requires
+ stream widths that are integer width ratios
+ from each other. Node %s will automatically be
+ set to HLS variant.""" % (
+ node.name,
+ )
+ warnings.warn(warn_str)
+ return "hls"
+ else:
+ # user setting can be fulfilled
+ return "rtl"
+ elif optype == "MVAU":
+ if _mvu_rtl_possible(node):
+ return "rtl"
+ else:
+ warn_str = """The RTL variant of %s cannot be used with the current settings. The node will automatically be
Please check the bit-widths to be <= 8 and ensure the + thresholds are implemented as standalone layer""" % ( + node.name, + ) + warnings.warn(warn_str) + return "hls" + elif optype == "VVAU": + if _vvu_rtl_possible(node, fpgapart): + return "rtl" + else: + warn_str = """There is no RTL variant for %s. The node will automatically be + set to HLS variant. Please check the bit-widths to be <= 8 and ensure the + thresholds are implemented as standalone layer. Note that the RTL-variant + of this layer is only supported on Versal boards""" % ( + node.name, + ) + warnings.warn(warn_str) + return "hls" + + if rtl_variant: + return "rtl" + elif hls_variant: + warn_str = """There is no RTL variant of %s. Node %s will automatically be + set to HLS variant.""" % ( + node.op_type, + node.name, + ) + warnings.warn(warn_str) + return "hls" + else: + raise Exception( + """Node {} with optype {} has no hw implementation variant)""".format( + node.name, optype + ) + ) + else: + raise Exception( + """Invalid value for attribute preferred_impl_style! Is currently set to: {} + has to be set to one of the following value ("hls", "rtl")""".format( + impl_style + ) + ) + + +def _dwc_determine_impl_style(node): + # when possible use rtl variant + dwc = getCustomOp(node) + dwc_in_width = dwc.get_nodeattr("inWidth") + dwc_out_width = dwc.get_nodeattr("outWidth") + # check if rtl variant can be used + iwidth_d = dwc_in_width % dwc_out_width == 0 + owidth_d = dwc_out_width % dwc_in_width == 0 + if iwidth_d or owidth_d: + return "rtl" + else: + return "hls" + + +def _swg_hls_possible(node): + # there are some constraints to + # the HLS variant of the SWG + # first constraint to check is + # if user has set dynamic_mode to 1 + # this is only supported in rtl variant + swg = getCustomOp(node) + if swg.get_nodeattr("dynamic_mode"): + return False + # the 2D HLS implementation for SWG + # can only be used for square inputs + # and no dilation + if swg.get_nodeattr("is1D"): + return True + else: + # extract all attributes to check + k = swg.get_nodeattr("ConvKernelDim") + ifm_dim = swg.get_nodeattr("IFMDim") + ofm_dim = swg.get_nodeattr("OFMDim") + s = swg.get_nodeattr("Stride") + d = swg.get_nodeattr("Dilation") + # check if square and dilation=1 + if ( + k[0] == k[1] + and ifm_dim[0] == ifm_dim[1] + and ofm_dim[0] == ofm_dim[1] + and s[0] == s[1] + and d[0] == d[1] == 1 + ): + return True + else: + return False + + +def _mvu_rtl_possible(n): + # Checks whether RTL-based MVU is supported + # Currently, for DSP48 we only support computations up to + # 8sx8u (8-bit signed weights x 8-bit (un)signed activations) + # and for DSP58 we support up to 8sx9s. Next to that, + # embedded thresholding functionality is not supported and + # neither binaryxnormode computation. 
+    inp_width_in_range = (
+        DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8
+    ) or (
+        DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9
+        and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0
+    )
+    weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8
+    signed_weights = DataType[getCustomOp(n).get_nodeattr("weightDataType")].min() < 0
+    no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1
+    not_binaryxnor_mode = getCustomOp(n).get_nodeattr("binaryXnorMode") == 0
+
+    return (
+        inp_width_in_range
+        and weight_width_in_range
+        and signed_weights
+        and no_activation
+        and not_binaryxnor_mode
+    )
+
+
+def _vvu_rtl_possible(n, fpgapart):
+    # Checks whether the RTL-based VVU is supported.
+    # Currently, we only support the RTL VVU on DSP58 up to 8sx9s inputs
+    # (8-bit signed weights x (9-bit signed OR 8-bit (un)signed) activations).
+    # In addition, embedded thresholding functionality is not supported.
+    in_width_in_range = (
+        DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8
+    ) or (
+        DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9
+        and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0
+    )
+    weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8
+    signed_weights = DataType[getCustomOp(n).get_nodeattr("weightDataType")].min() < 0
+    is_versal_family = is_versal(fpgapart)
+    no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1
+
+    return (
+        in_width_in_range
+        and weight_width_in_range
+        and signed_weights
+        and is_versal_family
+        and no_activation
+    )
+
+
+class SpecializeLayers(Transformation):
+    """Specialize all layers to either HLS or RTL variants."""
+
+    def __init__(self, fpgapart=""):
+        super().__init__()
+        self.fpgapart = fpgapart
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            # skip nodes that are not hw layers
+            if node.domain != "finn.custom_op.fpgadataflow":
+                continue
+            node_ind += 1
+            impl_style = _determine_impl_style(node, self.fpgapart)
+            optype = node.op_type + "_" + impl_style
+
+            new_node = helper.make_node(
+                optype,
+                node.input,
+                node.output,
+                domain="finn.custom_op.fpgadataflow." + impl_style,
+            )
+            # copy all attributes over to the specialized node,
+            # except for preferred_impl_style
+            for attribute in node.attribute:
+                if attribute.name != "preferred_impl_style":
+                    new_node.attribute.append(attribute)
+            if new_node.op_type == "MVAU_rtl":
+                is_versal_family = is_versal(self.fpgapart)
+                getCustomOp(new_node).set_nodeattr("is_versal", is_versal_family)
+            graph.node.insert(node_ind, new_node)
+            # remove the old node
+            graph.node.remove(node)
+            graph_modified = True
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index a102660001..da7624b8ff 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2020, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
 # All rights reserved.
# # Redistribution and use in source and binary forms, with or without @@ -49,6 +50,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import make_build_dir from . import templates @@ -381,7 +383,7 @@ def __init__( def apply(self, model): _check_vitis_envvars() # prepare at global level, then break up into kernels - prep_transforms = [InsertIODMA(512), InsertDWC()] + prep_transforms = [InsertIODMA(512), InsertDWC(), SpecializeLayers()] for trn in prep_transforms: model = model.transform(trn) model = model.transform(GiveUniqueNodeNames()) @@ -403,6 +405,7 @@ def apply(self, model): dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) + kernel_model = kernel_model.transform(SpecializeLayers()) kernel_model = kernel_model.transform(RemoveUnusedTensors()) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py index ed553e7cee..2e6639c5c6 100644 --- a/src/finn/transformation/move_reshape.py +++ b/src/finn/transformation/move_reshape.py @@ -1,22 +1,9 @@ import warnings from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation -from qonnx.util.basic import get_by_name, is_finn_op +from qonnx.util.basic import get_by_name - -def _is_fpgadataflow_node(node): - if node is not None: - if is_finn_op(node.domain): - n_backend = get_by_name(node.attribute, "backend") - if n_backend is None: - return False - backend_value = n_backend.s.decode("UTF-8") - if backend_value == "fpgadataflow": - return True - else: - return False - else: - return False +from finn.util.fpgadataflow import is_fpgadataflow_node class RemoveCNVtoFCFlatten(Transformation): @@ -34,10 +21,10 @@ def apply(self, model): oshape = model.get_tensor_shape(n.output[0]) if len(oshape) == 2 and ishape[0] == oshape[0]: producer = model.find_producer(n.input[0]) - if _is_fpgadataflow_node(producer) is True: + if is_fpgadataflow_node(producer): # standalone flatten, remove consumer = model.find_consumer(n.output[0]) - if _is_fpgadataflow_node(consumer) is True: + if is_fpgadataflow_node(consumer): graph_modified = True consumer.input[0] = n.input[0] graph.node.remove(n) @@ -48,9 +35,9 @@ def apply(self, model): perms = list(get_by_name(transp_node.attribute, "perm").ints) if perms == [0, 3, 1, 2]: producer = model.find_producer(transp_node.input[0]) - if _is_fpgadataflow_node(producer) is True: + if is_fpgadataflow_node(producer): consumer = model.find_consumer(n.output[0]) - if consumer.op_type == "MatrixVectorActivation": + if consumer.op_type.startswith("MVAU"): fc_inst = getCustomOp(consumer) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 1796738c58..1995d9f06a 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
 #
 # Redistribution and use in source and binary forms, with or without
@@ -11,7 +11,7 @@
 #   this list of conditions and the following disclaimer in the documentation
 #   and/or other materials provided with the distribution.
 #
-# * Neither the name of Xilinx nor the names of its
+# * Neither the name of FINN nor the names of its
 #   contributors may be used to endorse or promote products derived from
 #   this software without specific prior written permission.
 #
@@ -30,6 +30,7 @@
 import subprocess
 import sys
 import tempfile
+from qonnx.util.basic import roundup_to_integer_multiple
 
 # test boards
 test_board_map = ["Pynq-Z1", "KV260_SOM", "ZCU104", "U250"]
@@ -76,6 +77,11 @@
 alveo_default_platform["U280"] = "xilinx_u280_gen3x16_xdma_1_202211_1"
 alveo_default_platform["U55C"] = "xilinx_u55c_gen3x16_xdma_3_202210_1"
 
+# Create a joint part map, encompassing other boards too
+part_map = {**pynq_part_map, **alveo_part_map}
+part_map["VEK280"] = "xcve2802-vsvh1760-2MP-e-S"
+part_map["VCK190"] = "xcvc1902-vsva2197-2MP-e-S"
+
 
 def get_rtlsim_trace_depth():
     """Return the trace depth for rtlsim via PyVerilator. Controllable
@@ -228,3 +234,63 @@ def is_exe(fpath):
                 return exe_file
 
     return None
+
+
+mem_primitives_versal = {
+    "URAM_72x4096": (72, 4096),
+    "URAM_36x8192": (36, 8192),
+    "URAM_18x16384": (18, 16384),
+    "URAM_9x32768": (9, 32768),
+    "BRAM18_36x512": (36, 512),
+    "BRAM18_18x1024": (18, 1024),
+    "BRAM18_9x2048": (9, 2048),
+    "LUTRAM": (1, 64),
+}
+
+
+def get_memutil_alternatives(
+    req_mem_spec, mem_primitives=mem_primitives_versal, sort_min_waste=True
+):
+    """Computes how many instances of a memory primitive are necessary to
+    implement a desired memory size, where req_mem_spec is the desired
+    size and primitive_spec is the primitive size. The sizes are expressed
+    as tuples of (mem_width, mem_depth). Returns a list of tuples of the form
+    (primitive_name, (primitive_count, efficiency, waste)) where efficiency in
+    range [0,1] indicates how much of the total capacity is utilized, and waste
+    indicates how many bits of storage are wasted. If sort_min_waste is True,
+    the list is sorted by increasing waste.
+    """
+    ret = [
+        (primitive_name, memutil(req_mem_spec, primitive_spec))
+        for (primitive_name, primitive_spec) in mem_primitives.items()
+    ]
+    if sort_min_waste:
+        ret = sorted(ret, key=lambda x: x[1][2])
+    return ret
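+
+
+# Worked example (illustrative): for a required size of (20, 1000),
+# memutil((20, 1000), (18, 1024)) returns (2, ~0.54, 16864): two
+# BRAM18_18x1024 primitives form a 36-wide x 1024-deep array, of which
+# 20 * 1000 = 20000 of the 2 * 18 * 1024 = 36864 bits are used.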
+
+
+def memutil(req_mem_spec, primitive_spec):
+    """Computes how many instances of a memory primitive are necessary to
+    implement a desired memory size, where req_mem_spec is the desired
+    size and primitive_spec is the primitive size. The sizes are expressed
+    as tuples of (mem_width, mem_depth). Returns (primitive_count, efficiency, waste)
+    where efficiency in range [0,1] indicates how much of the total capacity is
+    utilized, and waste indicates how many bits of storage are wasted."""
+
+    req_width, req_depth = req_mem_spec
+    prim_width, prim_depth = primitive_spec
+
+    match_width = roundup_to_integer_multiple(req_width, prim_width)
+    match_depth = roundup_to_integer_multiple(req_depth, prim_depth)
+    count_width = match_width // prim_width
+    count_depth = match_depth // prim_depth
+    count = count_depth * count_width
+    eff = (req_width * req_depth) / (count * prim_width * prim_depth)
+    waste = (count * prim_width * prim_depth) - (req_width * req_depth)
+    return (count, eff, waste)
diff --git a/src/finn/util/create.py b/src/finn/util/create.py
index af92d1cb8e..09ec4f334c 100644
--- a/src/finn/util/create.py
+++ b/src/finn/util/create.py
@@ -143,7 +143,7 @@ def hls_mlp_maker(layer_spec):
         actval = 0
         no_act = 1
     FCLayer_node = helper.make_node(
-        "MatrixVectorActivation",
+        "MVAU",
         node_inp_list,
         [current_out_name],
         domain="finn.custom_op.fpgadataflow",
diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py
index 769ddb9465..3d3d343cd4 100644
--- a/src/finn/util/fpgadataflow.py
+++ b/src/finn/util/fpgadataflow.py
@@ -41,3 +41,39 @@ def is_fpgadataflow_node(node):
                     is_node = True
 
     return is_node
+
+
+def is_hls_node(node):
+    """Returns True if the given node is an HLS node, otherwise False."""
+    is_node = False
+    if node is not None:
+        if node.domain == "finn.custom_op.fpgadataflow.hls":
+            n_backend = get_by_name(node.attribute, "backend")
+            if n_backend is not None:
+                backend_value = n_backend.s.decode("UTF-8")
+                if backend_value == "fpgadataflow":
+                    is_node = True
+
+    return is_node
+
+
+def is_rtl_node(node):
+    """Returns True if the given node is an RTL node, otherwise False."""
+    is_node = False
+    if node is not None:
+        if node.domain == "finn.custom_op.fpgadataflow.rtl":
+            n_backend = get_by_name(node.attribute, "backend")
+            if n_backend is not None:
+                backend_value = n_backend.s.decode("UTF-8")
+                if backend_value == "fpgadataflow":
+                    is_node = True
+
+    return is_node
+
+
+def is_versal(fpgapart):
+    """Returns whether the given FPGA part belongs to the Versal family."""
+    return (
+        fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"]
+        or fpgapart[0:5] == "xqrvc"
+    )
diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py
index 318ba7045e..7486402be5 100644
--- a/src/finn/util/pyverilator.py
+++ b/src/finn/util/pyverilator.py
@@ -147,7 +147,7 @@ def verilator_fifosim(model, n_inputs, max_iters=100000000):
     fifo_log = []
     fifo_log_templ = '    results_file << "maxcount%s" << "\\t" '
     fifo_log_templ += "<< to_string(top->maxcount%s) << endl;"
-    fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")
+    fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO_rtl")
     fifo_ind = 0
     for fifo_node in fifo_nodes:
         fifo_node = getCustomOp(fifo_node)
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index db065fec42..94134967fa 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2020, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
 # All rights reserved.
# # Redistribution and use in source and binary forms, with or without @@ -60,7 +61,7 @@ from qonnx.util.cleanup import cleanup as qonnx_cleanup from shutil import copy -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.core.onnx_exec import execute_onnx @@ -85,6 +86,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline @@ -93,6 +95,7 @@ MoveScalarLinearPastInvariants, ) from finn.util.basic import get_finn_root, make_build_dir, test_board_map +from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.pytorch import ToTensor from finn.util.test import ( execute_parent, @@ -105,7 +108,7 @@ build_dir = os.environ["FINN_BUILD_DIR"] target_clk_ns = 20 -mem_mode = "decoupled" +mem_mode = "internal_decoupled" rtlsim_trace = False @@ -119,7 +122,7 @@ def get_checkpoint_name(topology, wbits, abits, step): def fold_tfc(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # (PE, SIMD, ramstyle) for each layer config = [(16, 49, "block"), (8, 8, "auto"), (8, 8, "auto"), (10, 8, "distributed")] for fcl, (pe, simd, ramstyle) in zip(fc_layers, config): @@ -127,17 +130,17 @@ def fold_tfc(model): fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) + fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") # set parallelism for input quantizer to be same as first layer's SIMD - inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0] + inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] inp_qnt = getCustomOp(inp_qnt_node) inp_qnt.set_nodeattr("PE", 49) - inp_qnt.set_nodeattr("mem_mode", "decoupled") inp_qnt.set_nodeattr("runtime_writeable_weights", 1) return model def fold_lfc(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # (PE, SIMD, ramstyle) for each layer config = [ (32, 49, "block"), @@ -151,15 +154,16 @@ def fold_lfc(model): fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) fcl_inst.set_nodeattr("runtime_writeable_weights", 1) + fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") # set parallelism for input quantizer to be same as first layer's SIMD - inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0] + inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] inp_qnt = getCustomOp(inp_qnt_node) inp_qnt.set_nodeattr("PE", 49) return model def fold_cnv_large(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # each tuple is (PE, SIMD) for a layer folding = [ (16, 3), @@ -176,8 +180,9 @@ def fold_cnv_large(model): fcl_inst = getCustomOp(fcl) fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) + fcl_inst.set_nodeattr("mem_mode", 
"internal_decoupled") - swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] @@ -186,7 +191,7 @@ def fold_cnv_large(model): def fold_cnv_small(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # each tuple is (PE, SIMD) for a layer folding = [ (8, 3, "distributed"), @@ -204,8 +209,9 @@ def fold_cnv_small(model): fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) + fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") - swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] @@ -529,56 +535,56 @@ def test_streamline(self, topology, wbits, abits, board): model = model.transform(RemoveUnusedTensors()) model.save(get_checkpoint_name(topology, wbits, abits, "streamline")) - def test_convert_to_hls_layers(self, topology, wbits, abits, board): + def test_convert_to_hw_layers(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "streamline") model = load_test_checkpoint_or_skip(prev_chkpt_name) if topology == "tfc" and wbits == 1 and abits == 1: # use standalone thresholds for tfc-w1a1 to also exercise that option - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for bipolar MatMul layers - model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) # needed for non-bipolar MatMul layers - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) # TopK to LabelSelect - model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) # input quantization (if any) to standalone thresholding - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for convolutions if "fc" not in topology: - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferStreamingMaxPool()) model = model.transform(RemoveCNVtoFCFlatten()) # get rid of Tranpose -> Tranpose identity seq model = model.transform(absorb.AbsorbConsecutiveTransposes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferDataLayouts()) - model.save(get_checkpoint_name(topology, wbits, abits, "convert_to_hls_layers")) + model.save(get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers")) exp_layer_counts = { "tfc": [ ("Reshape", 1), - ("Thresholding_Batch", 1), - ("MatrixVectorActivation", 4), - ("LabelSelect_Batch", 1), + ("Thresholding", 1), + ("MVAU", 4), + ("LabelSelect", 1), ], "tfc-1-1": [ ("Reshape", 1), - ("Thresholding_Batch", 4), - ("MatrixVectorActivation", 4), - ("LabelSelect_Batch", 1), + ("Thresholding", 4), + ("MVAU", 4), + ("LabelSelect", 1), ], "lfc": [ ("Reshape", 1), - ("Thresholding_Batch", 1), - ("MatrixVectorActivation", 4), - ("LabelSelect_Batch", 1), 
+ ("Thresholding", 1), + ("MVAU", 4), + ("LabelSelect", 1), ], "cnv": [ ("Transpose", 1), - ("Thresholding_Batch", 1), + ("Thresholding", 1), ("ConvolutionInputGenerator", 6), - ("MatrixVectorActivation", 9), - ("StreamingMaxPool_Batch", 2), - ("LabelSelect_Batch", 1), + ("MVAU", 9), + ("StreamingMaxPool", 2), + ("LabelSelect", 1), ], } if topology == "tfc" and wbits == 1 and abits == 1: @@ -589,8 +595,67 @@ def test_convert_to_hls_layers(self, topology, wbits, abits, board): for op_type, exp_count in exp_layer_counts: assert len(model.get_nodes_by_op_type(op_type)) == exp_count + def test_specialize_layers(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers") + model = load_test_checkpoint_or_skip(prev_chkpt_name) + # set preferred impl style to hls for all layers + force_hls_boards = ["Pynq-Z1", "U250"] + if topology == "cnv" and wbits == 2 and abits == 2 and board in force_hls_boards: + for node in model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) + model.save(get_checkpoint_name(topology, wbits, abits, "specialize_layers")) + exp_layer_counts = { + "tfc": [ + ("Reshape", 1), + ("Thresholding_rtl", 1), + ("MVAU_hls", 4), + ("LabelSelect_hls", 1), + ], + "tfc-1-1": [ + ("Reshape", 1), + ("Thresholding_rtl", 4), + ("MVAU_hls", 4), + ("LabelSelect_hls", 1), + ], + "lfc": [ + ("Reshape", 1), + ("Thresholding_rtl", 1), + ("MVAU_hls", 4), + ("LabelSelect_hls", 1), + ], + "cnv": [ + ("Transpose", 1), + ("Thresholding_rtl", 1), + ("ConvolutionInputGenerator_rtl", 6), + ("MVAU_hls", 9), + ("StreamingMaxPool_hls", 2), + ("LabelSelect_hls", 1), + ], + "cnv-2-2": [ + ("Transpose", 1), + ("Thresholding_hls", 1), + ("ConvolutionInputGenerator_hls", 6), + ("MVAU_hls", 9), + ("StreamingMaxPool_hls", 2), + ("LabelSelect_hls", 1), + ], + } + if topology == "tfc" and wbits == 1 and abits == 1: + exp_key = "tfc-1-1" + elif topology == "cnv" and wbits == 2 and abits == 2 and board in force_hls_boards: + exp_key = "cnv-2-2" + else: + exp_key = topology + exp_layer_counts = exp_layer_counts[exp_key] + for op_type, exp_count in exp_layer_counts: + assert len(model.get_nodes_by_op_type(op_type)) == exp_count + def test_create_dataflow_partition(self, topology, wbits, abits, board): - prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hls_layers") + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "specialize_layers") model = load_test_checkpoint_or_skip(prev_chkpt_name) parent_model = model.transform(CreateDataflowPartition()) parent_model_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") @@ -662,7 +727,8 @@ def test_set_fifo_depths(self, topology, wbits, abits, board): ) else: model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) - fifo_layers = model.get_nodes_by_op_type("StreamingFIFO") + + fifo_layers = model.get_nodes_by_op_type("StreamingFIFO_rtl") assert len(fifo_layers) > 0 model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board)) @@ -673,12 +739,13 @@ def test_ipstitch_rtlsim(self, topology, wbits, abits, board): model = load_test_checkpoint_or_skip(prev_chkpt_name) test_fpga_part = get_build_env(board, target_clk_ns)["part"] model = model.transform(InsertDWC()) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = 
model.transform(AnnotateCycles()) perf = model.analysis(dataflow_performance) latency = perf["critical_path_cycles"] # rtlsim only supports impl_style=rtl for StreamingFIFO, ensure that - for fifo_layer in model.get_nodes_by_op_type("StreamingFIFO"): + for fifo_layer in model.get_nodes_by_op_type("StreamingFIFO_rtl"): getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl") model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py index 1cd38eb83a..9ee07d57a3 100644 --- a/tests/end2end/test_end2end_cybsec_mlp.py +++ b/tests/end2end/test_end2end_cybsec_mlp.py @@ -168,6 +168,7 @@ def test_end2end_cybsec_mlp_build(): # check the generated files assert os.path.isfile(output_dir + "/time_per_step.json") assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/driver/driver.py") est_cycles_report = output_dir + "/report/estimate_layer_cycles.json" assert os.path.isfile(est_cycles_report) @@ -181,8 +182,8 @@ def test_end2end_cybsec_mlp_build(): # examine the report contents with open(est_cycles_report, "r") as f: est_cycles_dict = json.load(f) - assert est_cycles_dict["MatrixVectorActivation_0"] == 80 - assert est_cycles_dict["MatrixVectorActivation_1"] == 64 + assert est_cycles_dict["MVAU_hls_0"] == 80 + assert est_cycles_dict["MVAU_hls_1"] == 64 with open(est_res_report, "r") as f: est_res_dict = json.load(f) assert est_res_dict["total"]["LUT"] == 7899.0 diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 2d25a2bf0d..cbf89c2eae 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -54,7 +55,7 @@ from qonnx.transformation.remove import RemoveIdentityOps from qonnx.util.cleanup import cleanup as qonnx_cleanup -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb import finn.transformation.streamline.reorder as reorder from finn.core.onnx_exec import execute_onnx @@ -62,8 +63,15 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import ( CreateDataflowPartition, ) +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul @@ -83,7 +91,6 @@ test_platform = alveo_default_platform[test_board] test_fpga_part = alveo_part_map[test_board] target_clk_ns = 3 -mem_mode = "decoupled" large_fifo_ram_style = "ultra" extra_fold = 1 first_layer_res_type = "dsp" @@ -211,29 +218,41 @@ def test_end2end_mobilenet_lowering(): @pytest.mark.end2end -def test_end2end_mobilenet_convert_to_hls_layers(): +@pytest.mark.xfail +def test_end2end_mobilenet_convert_to_hw_layers(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_lowered.onnx") - model = model.transform(to_hls.InferPool_Batch()) - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferVectorVectorActivation()) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode)) - model = model.transform(to_hls.InferChannelwiseLinearLayer()) - model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferThresholdingLayer()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) model = model.transform(InferShapes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) - model.save(build_dir + "/end2end_mobilenet_hls_layers.onnx") + model.save(build_dir + "/end2end_mobilenet_hw_layers.onnx") + + +@pytest.mark.end2end +def test_end2end_mobilenet_specialize_layers(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hw_layers.onnx") + model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save(build_dir + "/end2end_mobilenet_specialize_layers.onnx") @pytest.mark.end2end def test_end2end_mobilenet_folding(): - model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hls_layers.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_specialize_layers.onnx") # optional extra folding to use fewer resources # applied while setting the attributes on each 
node
     assert extra_fold in [1, 2, 4]
-    # set up folding for the depthwise conv layers impl'd by VVAUs
+    # set up folding for the conv layers impl'd by MVAUs
     # each value is PE for a layer
-    fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation")
+    fc_layers = model.get_nodes_by_op_type("MVAU_hls")
+    fc_layers += model.get_nodes_by_op_type("MVAU_rtl")
     # each tuple is (PE, SIMD, ram_style) for a layer
     folding = [
         (32, 3, "block"),
@@ -262,7 +281,8 @@
     getCustomOp(fc_layers[0]).set_nodeattr("resType", first_layer_res_type)
     # set up folding for the depthwise conv layers impl'd by VVAUs
     # each value is PE for a layer
-    vvau_layers = model.get_nodes_by_op_type("VectorVectorActivation")
+    vvau_layers = model.get_nodes_by_op_type("VVAU_hls")
+    vvau_layers += model.get_nodes_by_op_type("VVAU_rtl")
     folding = [32, 32, 64, 16, 32, 8, 16, 16, 16, 16, 16, 4, 8]
     for vvau, pe in zip(vvau_layers, folding):
         vvau_inst = getCustomOp(vvau)
@@ -273,11 +293,11 @@
         convinputgen_inst.set_nodeattr("SIMD", pe // extra_fold)
         # set SIMD in preceding FMPadding to same value
         padding = model.find_direct_predecessors(convinputgen)[0]
-        if padding.op_type == "FMPadding_Batch":
+        if padding.op_type == "FMPadding_hls":
             padding_inst = getCustomOp(padding)
             padding_inst.set_nodeattr("SIMD", pe // extra_fold)
     # adjust final pooling layer + its inpgen
-    pool_node = model.get_nodes_by_op_type("Pool_Batch")[0]
+    pool_node = model.get_nodes_by_op_type("Pool_hls")[0]
     pool_inst = getCustomOp(pool_node)
     pool_inst.set_nodeattr("PE", 4 // extra_fold)
     pool_inpgen = model.find_direct_predecessors(pool_node)[0]
@@ -288,8 +308,16 @@
 
 
 @pytest.mark.end2end
-def test_end2end_mobilenet_create_dataflow_partition():
+def test_end2end_mobilenet_minimize_bit_width():
     model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx")
+    model = model.transform(MinimizeAccumulatorWidth())
+    model = model.transform(MinimizeWeightBitWidth())
+    model.save(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx")
+
+
+@pytest.mark.end2end
+def test_end2end_mobilenet_create_dataflow_partition():
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx")
     parent_model = model.transform(CreateDataflowPartition())
     parent_model.save(build_dir + "/end2end_mobilenet_dataflow_parent.onnx")
     sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
@@ -305,7 +333,7 @@
 @pytest.mark.end2end
 @pytest.mark.xfail
 def test_end2end_mobilenet_cppsim():
-    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx")
+    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx")
     x = np.load(build_dir + "/end2end_mobilenet_input.npy")
     inp_name = model.graph.input[0].name
     out_name = model.graph.output[0].name
diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py
index f5edabbd4b..deb9dd43b4 100644
--- a/tests/fpgadataflow/test_code_gen_trafo.py
+++ b/tests/fpgadataflow/test_code_gen_trafo.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2020, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
 # All rights reserved.
# # Redistribution and use in source and binary forms, with or without @@ -50,10 +51,10 @@ def test_code_gen_trafo(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, mh]) node_inp_list = ["inp", "weights", "thresh"] FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", node_inp_list, ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", code_gen_dir="", executable_path="", diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py index d04b68a56b..7022311d4c 100644 --- a/tests/fpgadataflow/test_compilation_trafo.py +++ b/tests/fpgadataflow/test_compilation_trafo.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -51,10 +52,10 @@ def test_compilation_trafo(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, mh]) node_inp_list = ["inp", "weights", "thresh"] FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", node_inp_list, ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", code_gen_dir="", executable_path="", diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py similarity index 85% rename from tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py rename to tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py index 2af0957e12..c5d0281203 100644 --- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -41,7 +42,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -49,6 +50,8 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.fpgadataflow import is_fpgadataflow_node # conv_config: @@ -71,7 +74,7 @@ @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): +def test_convert_to_hw_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): pad, kernel_size, stride, dilation = conv_config np.random.seed(0) idt = DataType["UINT4"] @@ -85,9 +88,6 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ pad_h = pad[0] + pad[2] pad_w = pad[1] + pad[3] - if use_rtl_swg and exec_mode == "cppsim": - pytest.skip("cppsim not supported for RTL SWG") - if depthwise is True: group = out_chn = in_chn conv_param_shape = [out_chn, 1, k_h, k_w] @@ -135,12 +135,23 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) + new_model = new_model.transform(to_hw.InferConvInpGen()) + if not use_rtl_swg: + for node in new_model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") if depthwise is True: - new_model = new_model.transform(to_hls.InferVectorVectorActivation()) + new_model = new_model.transform(to_hw.InferVectorVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) else: - new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - fc_node = new_model.get_nodes_by_op_type("MatrixVectorActivation")[0] + new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) + # set folding parameters for MVAU + if new_model.get_nodes_by_op_type("MVAU_hls"): + fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] + else: + fc_node = new_model.get_nodes_by_op_type("MVAU_rtl")[0] fc_inst = getCustomOp(fc_node) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") @@ -171,12 +182,12 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ assert oxe.compare_execution(model, new_model, inp_dict) if pad_h == 1 and pad_w == 1: - padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0] + padding_node = new_model.get_nodes_by_op_type("FMPadding_rtl")[0] padding_inst = getCustomOp(padding_node) assert padding_inst.get_nodeattr("SIMD") == in_chn if depthwise is True and exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("VectorVectorActivation")[0] + node = new_model.get_nodes_by_op_type("VVAU_hls")[0] inst = 
getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py similarity index 76% rename from tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py rename to tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py index bb2c1d74c2..4b063f8505 100644 --- a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -38,20 +38,21 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def prepare_inputs(input_tensor): return {"inp": input_tensor} -def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): +def make_single_channelwise_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, ishape) p0 = helper.make_tensor_value_info("p0", TensorProto.FLOAT, pshape) @@ -87,7 +88,7 @@ def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_convert_to_hls_channelwise_layer(pdt, idt, onnx_op_name, scalar_param, exec_mode): +def test_convert_to_hw_channelwise_layer(pdt, idt, onnx_op_name, scalar_param, exec_mode): ifm_ch = 16 ifm_dim = 5 ishape = (1, ifm_ch, ifm_dim, ifm_dim) @@ -97,7 +98,7 @@ def test_convert_to_hls_channelwise_layer(pdt, idt, onnx_op_name, scalar_param, pshape = (1, ifm_ch, 1, 1) np.random.seed(0) - model = make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape) + model = make_single_channelwise_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape) # Since the aren't Data types with a bit width of a non power of 2, # there are cases where the input won't use it full range. 
@@ -111,24 +112,32 @@ def test_convert_to_hls_channelwise_layer(pdt, idt, onnx_op_name, scalar_param, input_dict = prepare_inputs(x) y_expected = oxe.execute_onnx(model, input_dict)["outp"] - new_model = model.transform(to_hls.InferChannelwiseLinearLayer()) - new_model = new_model.transform(GiveUniqueNodeNames()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(GiveUniqueNodeNames()) + + ctx_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) + y_produced = ctx_produced["outp"] + + assert (y_produced == y_expected).all() + assert model.graph.node[1].op_type == "ChannelwiseOp" + + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": - new_model = new_model.transform(PrepareCppSim()) - new_model = new_model.transform(CompileCppSim()) - new_model = new_model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) elif exec_mode == "rtlsim": - new_model = new_model.transform(SetExecMode("rtlsim")) - new_model = new_model.transform(GiveUniqueNodeNames()) - new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5)) - new_model = new_model.transform(HLSSynthIP()) - new_model = new_model.transform(PrepareRTLSim()) + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) else: raise Exception("Unknown exec_mode") - ctx_produced = oxe.execute_onnx(new_model, input_dict, return_full_exec_context=True) + ctx_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) y_produced = ctx_produced["outp"] assert (y_produced == y_expected).all() - assert new_model.graph.node[1].op_type == "ChannelwiseOp_Batch" + assert model.graph.node[1].op_type == "ChannelwiseOp_hls" diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py similarity index 89% rename from tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py rename to tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py index 94007bdd14..f7b3c55c2a 100755 --- a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -34,6 +35,7 @@ from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames, RemoveUnusedTensors from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_datatypes import InferDataTypes @@ -42,14 +44,16 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants +from finn.util.fpgadataflow import is_fpgadataflow_node def get_multithreshold_rand_params(channels, num_of_thres, seed=None): @@ -78,7 +82,7 @@ def get_multithreshold_rand_params(channels, num_of_thres, seed=None): @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): +def test_convert_to_hw_conv_fc_transition(conv_config, depthwise, use_reshape): np.random.seed(0) idt = DataType["UINT4"] odt = DataType["UINT4"] @@ -187,15 +191,20 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): new_model = new_model.transform(InferDataLayouts()) new_model = new_model.transform(RemoveUnusedTensors()) - # convert_to_hls + # convert_to_hw if depthwise is True: - new_model = new_model.transform(to_hls.InferVectorVectorActivation()) - new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - new_model = new_model.transform(to_hls.InferThresholdingLayer()) - new_model = new_model.transform(to_hls.InferConvInpGen()) - new_model = new_model.transform(to_hls.InferStreamingMaxPool()) + new_model = new_model.transform(to_hw.InferVectorVectorActivation()) + new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + new_model = new_model.transform(to_hw.InferThresholdingLayer()) + new_model = new_model.transform(to_hw.InferConvInpGen()) + new_model = new_model.transform(to_hw.InferStreamingMaxPool()) new_model = new_model.transform(RemoveCNVtoFCFlatten()) new_model = new_model.transform(absorb.AbsorbConsecutiveTransposes()) + for node in new_model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") + new_model = new_model.transform(SpecializeLayers()) new_model = new_model.transform(GiveUniqueNodeNames()) new_model = new_model.transform(InferDataLayouts()) diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py similarity index 83% rename from tests/fpgadataflow/test_convert_to_hls_conv_layer.py rename to tests/fpgadataflow/test_convert_to_hw_conv_layer.py index 95beffafac..61f8af7806 100644 --- 
a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -41,7 +42,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -49,6 +50,8 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.fpgadataflow import is_fpgadataflow_node # conv_config kernel_size,stride, pad @@ -62,7 +65,7 @@ @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): +def test_convert_to_hw_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): kernel_size, stride, pad = conv_config np.random.seed(0) idt = DataType["UINT4"] @@ -71,7 +74,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod in_chn = 16 if use_rtl_swg and exec_mode == "cppsim": - pytest.skip("cppsim not supported for RTL SWG") + pytest.skip("Skip cppsim if SWG in rtl") if depthwise is True: group = out_chn = in_chn @@ -120,12 +123,23 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) + new_model = new_model.transform(to_hw.InferConvInpGen()) + if not use_rtl_swg: + for node in new_model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") if depthwise is True: - new_model = new_model.transform(to_hls.InferVectorVectorActivation()) + new_model = new_model.transform(to_hw.InferVectorVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) else: - new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - fc_node = new_model.get_nodes_by_op_type("MatrixVectorActivation")[0] + new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) + # set folding parameters for MVAU + if new_model.get_nodes_by_op_type("MVAU_hls"): + fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] + else: + fc_node = new_model.get_nodes_by_op_type("MVAU_rtl")[0] fc_inst = getCustomOp(fc_node) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") @@ -156,9 +170,9 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod assert oxe.compare_execution(model, new_model, inp_dict) if not use_rtl_swg and kernel_size == 1 and stride > 1 and pad == 0: - assert new_model.graph.node[1].op_type == "DownSampler" + assert new_model.graph.node[1].op_type == "DownSampler_hls" if exec_mode == "rtlsim": 
- node = new_model.get_nodes_by_op_type("DownSampler")[0] + node = new_model.get_nodes_by_op_type("DownSampler_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) @@ -170,12 +184,12 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod if use_rtl_swg: padding_node = new_model.get_nodes_by_op_type("FMPadding_rtl")[0] else: - padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0] + padding_node = new_model.get_nodes_by_op_type("FMPadding_hls")[0] padding_inst = getCustomOp(padding_node) assert padding_inst.get_nodeattr("SIMD") == in_chn if depthwise is True and exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("VectorVectorActivation")[0] + node = new_model.get_nodes_by_op_type("VVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py similarity index 82% rename from tests/fpgadataflow/test_convert_to_hls_layers_cnv.py rename to tests/fpgadataflow/test_convert_to_hw_layers_cnv.py index c9cb4f0802..71f383ca23 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -48,24 +49,26 @@ from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC +from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.test import get_test_model_trained -export_onnx_path_cnv = "test_convert_to_hls_layers_cnv.onnx" +export_onnx_path_cnv = "test_convert_to_hw_layers_cnv.onnx" @pytest.mark.fpgadataflow @pytest.mark.vivado # Standalone or fused thresholding-based activation @pytest.mark.parametrize("fused_activation", [True, False]) -def test_convert_to_hls_layers_cnv_w1a1(fused_activation): +def test_convert_to_hw_layers_cnv_w1a1(fused_activation): cnv = get_test_model_trained("CNV", 1, 1) export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path_cnv) qonnx_cleanup(export_onnx_path_cnv, out_file=export_onnx_path_cnv) @@ -95,16 +98,24 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation): expected_ctx = oxe.execute_onnx(model, input_dict, True) expected = expected_ctx[model.graph.output[0].name] - # if we infer thresholding first, all MultiThresholds get converted to HLS + # if we infer thresholding first, all MultiThresholds get converted to HW # subsequently, the FC inference will generate passthrough MVAUs if not fused_activation: 
- model = model.transform(to_hls.InferThresholdingLayer()) - model = model.transform(to_hls.InferBinaryMatrixVectorActivation()) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferThresholdingLayer()) + + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferStreamingMaxPool()) + for node in model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers()) for node in model.graph.node: - if node.op_type == "MatrixVectorActivation": + if node.op_type == "MVAU_hls": inst = getCustomOp(node) - inst.set_nodeattr("mem_mode", "decoupled") + inst.set_nodeattr("mem_mode", "internal_decoupled") mw = inst.get_nodeattr("MW") mh = inst.get_nodeattr("MH") if mh % 4 == 0: @@ -117,25 +128,23 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation): else: simd = mw inst.set_nodeattr("SIMD", simd) - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) # check topology status finn_nodes = model.get_finn_nodes() if fused_activation: assert len(finn_nodes) == 18 else: assert len(finn_nodes) == 26 - thr_nodes = model.get_nodes_by_op_type("Thresholding_Batch") + thr_nodes = model.get_nodes_by_op_type("Thresholding_hls") assert len(thr_nodes) == 8 non_finn_nodes = model.get_non_finn_nodes() assert len(non_finn_nodes) == 5 exp_non_finn_nodes = ["Transpose", "Transpose", "Reshape", "Mul", "Add"] assert [x.op_type for x in non_finn_nodes] == exp_non_finn_nodes - fc_nodes = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_nodes = model.get_nodes_by_op_type("MVAU_hls") assert len(fc_nodes) == 9 - swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") assert len(swg_nodes) == 6 - mp_nodes = model.get_nodes_by_op_type("StreamingMaxPool_Batch") + mp_nodes = model.get_nodes_by_op_type("StreamingMaxPool_hls") assert len(mp_nodes) == 2 model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py similarity index 89% rename from tests/fpgadataflow/test_convert_to_hls_layers_fc.py rename to tests/fpgadataflow/test_convert_to_hw_layers_fc.py index 8a7b2509a4..746ded9074 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -48,22 +49,23 @@ from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.test import get_test_model_trained -export_onnx_path = "test_convert_to_hls_layers_fc.onnx" +export_onnx_path = "test_convert_to_hw_layers_fc.onnx" @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_convert_to_hls_layers_tfc_w1a1(): +def test_convert_to_hw_layers_tfc_w1a1(): tfc = get_test_model_trained("TFC", 1, 1) export_qonnx(tfc, torch.randn(1, 1, 28, 28), export_onnx_path) qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) @@ -79,24 +81,25 @@ def test_convert_to_hls_layers_tfc_w1a1(): model = model.transform(absorb.AbsorbAddIntoMultiThreshold()) model = model.transform(absorb.AbsorbMulIntoMultiThreshold()) model = model.transform(RoundAndClipThresholds()) - model = model.transform(to_hls.InferBinaryMatrixVectorActivation()) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) + model = model.transform(SpecializeLayers()) fc0 = model.graph.node[2] - assert fc0.op_type == "MatrixVectorActivation" + assert fc0.op_type.startswith("MVAU") assert model.get_tensor_shape(fc0.input[0]) == [1, 784] assert model.get_tensor_shape(fc0.input[1]) == [784, 64] assert model.get_tensor_shape(fc0.input[2]) == [64, 1] fc1 = model.graph.node[3] - assert fc1.op_type == "MatrixVectorActivation" + assert fc1.op_type.startswith("MVAU") assert model.get_tensor_shape(fc1.input[0]) == [1, 64] assert model.get_tensor_shape(fc1.input[1]) == [64, 64] assert model.get_tensor_shape(fc1.input[2]) == [64, 1] fc2 = model.graph.node[4] - assert fc2.op_type == "MatrixVectorActivation" + assert fc2.op_type.startswith("MVAU") assert model.get_tensor_shape(fc2.input[0]) == [1, 64] assert model.get_tensor_shape(fc2.input[1]) == [64, 64] assert model.get_tensor_shape(fc2.input[2]) == [64, 1] fc3 = model.graph.node[5] - assert fc3.op_type == "MatrixVectorActivation" + assert fc3.op_type.startswith("MVAU") assert model.get_tensor_shape(fc3.input[0]) == [1, 64] assert model.get_tensor_shape(fc3.input[1]) == [64, 10] @@ -137,7 +140,7 @@ def test_convert_to_hls_layers_tfc_w1a1(): @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_convert_to_hls_layers_tfc_w1a2(): +def test_convert_to_hw_layers_tfc_w1a2(): tfc = get_test_model_trained("TFC", 1, 2) export_qonnx(tfc, torch.randn(1, 1, 28, 28), export_onnx_path) qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) @@ -150,29 +153,26 @@ def test_convert_to_hls_layers_tfc_w1a2(): model = model.transform(GiveUniqueParameterTensors()) model = model.transform(GiveReadableTensorNames()) model = model.transform(Streamline()) - from finn.transformation.fpgadataflow.convert_to_hls_layers import ( - InferQuantizedMatrixVectorActivation, - ) - - model = 
model.transform(InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(SpecializeLayers()) fc0 = model.graph.node[2] - assert fc0.op_type == "MatrixVectorActivation" + assert fc0.op_type.startswith("MVAU") assert model.get_tensor_shape(fc0.input[0]) == [1, 784] assert model.get_tensor_shape(fc0.input[1]) == [784, 64] assert model.get_tensor_shape(fc0.input[2]) == [64, 2] fc1 = model.graph.node[3] - assert fc1.op_type == "MatrixVectorActivation" + assert fc1.op_type.startswith("MVAU") assert model.get_tensor_shape(fc1.input[0]) == [1, 64] assert model.get_tensor_shape(fc1.input[1]) == [64, 64] assert model.get_tensor_shape(fc1.input[2]) == [64, 2] fc2 = model.graph.node[4] - assert fc2.op_type == "MatrixVectorActivation" + assert fc2.op_type.startswith("MVAU") assert model.get_tensor_shape(fc2.input[0]) == [1, 64] assert model.get_tensor_shape(fc2.input[1]) == [64, 64] assert model.get_tensor_shape(fc2.input[2]) == [64, 2] fc3 = model.graph.node[5] - assert fc3.op_type == "MatrixVectorActivation" + assert fc3.op_type.startswith("MVAU") assert model.get_tensor_shape(fc3.input[0]) == [1, 64] assert model.get_tensor_shape(fc3.input[1]) == [64, 10] fc0w = getCustomOp(fc0) diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py similarity index 82% rename from tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py rename to tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py index f8e566156b..6c83f10617 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -46,10 +46,11 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.streamline.absorb import ( AbsorbConsecutiveTransposes, AbsorbScalarMulAddIntoTopK, @@ -67,7 +68,7 @@ export_onnx_path = "test_output_synthetic.onnx" # construct a synthetic graph to test: -# topk insertion, topk conversion to hls, add conversion to hls +# topk insertion, topk conversion to hw, add conversion to hw # graph should just be a sum @@ -136,7 +137,7 @@ def make_model(ch, ifmdim): @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): +def test_convert_to_hw_layers_synthetic(ch, ifmdim, idt): model = make_model(ch, ifmdim) model.save(export_onnx_path) model = ModelWrapper(export_onnx_path, fix_float64=True) @@ -145,7 +146,6 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataLayouts()) - # model.save("golden.onnx") # generate test vectors of correct shape if ifmdim == -1: input_tensor_shape = (1, ch) @@ -166,7 +166,7 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): model = model.transform(InferDataLayouts()) - # convert to hls + # convert to hw model.set_tensor_datatype(model.graph.input[0].name, idt) # extra streamlining model = model.transform(MoveScalarLinearPastInvariants()) @@ -179,35 +179,52 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): model = model.transform(InferDataLayouts()) model = model.transform(InferDataTypes()) - model = model.transform(to_hls.InferChannelwiseLinearLayer()) - model = model.transform(to_hls.InferAddStreamsLayer()) - model = model.transform(to_hls.InferGlobalAccPoolLayer()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferAddStreamsLayer()) + model = model.transform(to_hw.InferGlobalAccPoolLayer()) model = model.transform(MoveScalarLinearPastInvariants()) model = model.transform(InsertTopK()) model = model.transform(AbsorbScalarMulAddIntoTopK()) model = model.transform(InferDataTypes()) - model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) model = model.transform(AbsorbConsecutiveTransposes()) model = model.transform(InferDataTypes()) - model = model.transform(to_hls.InferLabelSelectLayer()) - model = model.transform(to_hls.InferDuplicateStreamsLayer()) + model = model.transform(to_hw.InferDuplicateStreamsLayer()) model = model.transform(SortGraph()) - # model.save("golden_hls.onnx") # check topology status finn_nodes = model.get_finn_nodes() assert len(finn_nodes) == 9 - add_nodes = model.get_nodes_by_op_type("AddStreams_Batch") + add_nodes = model.get_nodes_by_op_type("AddStreams") assert len(add_nodes) == 1 - pool_nodes = model.get_nodes_by_op_type("GlobalAccPool_Batch") + pool_nodes = model.get_nodes_by_op_type("GlobalAccPool") assert len(pool_nodes) == 
1 - label_nodes = model.get_nodes_by_op_type("LabelSelect_Batch") + label_nodes = model.get_nodes_by_op_type("LabelSelect") assert len(label_nodes) == 1 - channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp_Batch") + channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp") assert len(channelwise_nodes) == 5 - dup_nodes = model.get_nodes_by_op_type("DuplicateStreams_Batch") + dup_nodes = model.get_nodes_by_op_type("DuplicateStreams") + assert len(dup_nodes) == 1 + + output_hw = oxe.execute_onnx(model, input_dict, True) + + model = model.transform(SpecializeLayers()) + + # check topology status + + finn_nodes = model.get_finn_nodes() + assert len(finn_nodes) == 9 + add_nodes = model.get_nodes_by_op_type("AddStreams_hls") + assert len(add_nodes) == 1 + pool_nodes = model.get_nodes_by_op_type("GlobalAccPool_hls") + assert len(pool_nodes) == 1 + label_nodes = model.get_nodes_by_op_type("LabelSelect_hls") + assert len(label_nodes) == 1 + channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp_hls") + assert len(channelwise_nodes) == 5 + dup_nodes = model.get_nodes_by_op_type("DuplicateStreams_hls") assert len(dup_nodes) == 1 model = model.transform(PrepareCppSim()) @@ -215,7 +232,13 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): model = model.transform(SetExecMode("cppsim")) output_dict = oxe.execute_onnx(model, input_dict, True) - produced_topk_hls = output_dict[model.graph.output[0].name] + + # verify execution + outp_name = model.graph.output[0].name + # comparison before and after layer specialization + assert (output_dict[outp_name] == output_hw[outp_name]).all() + # comparison with golden output + produced_topk_hls = output_dict[outp_name] topk_input = output_dict[model.graph.node[-1].input[0]] assert soft_verify_topk(topk_input, produced_topk_hls, 5) diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py similarity index 87% rename from tests/fpgadataflow/test_convert_to_hls_pool_batch.py rename to tests/fpgadataflow/test_convert_to_hw_pool_batch.py index 417b4fbae2..d532cf345e 100644 --- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py +++ b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
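A verification idiom that recurs throughout these rewritten tests, as in the synthetic test above: execute the model once while its nodes are still abstract HW layers, then specialize, run cppsim, and require bit-identical outputs. A condensed sketch, with `model` and `input_dict` standing in for any prepared HW-layer model and its inputs:

```
import finn.core.onnx_exec as oxe
from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers

# run the abstract HW layers through their Python execution functions
output_hw = oxe.execute_onnx(model, input_dict, True)

# bind to HLS variants and simulate the generated C++ on the same inputs
model = model.transform(SpecializeLayers())
model = model.transform(PrepareCppSim())
model = model.transform(CompileCppSim())
model = model.transform(SetExecMode("cppsim"))
output_dict = oxe.execute_onnx(model, input_dict, True)

# outputs before and after specialization must match exactly
outp_name = model.graph.output[0].name
assert (output_dict[outp_name] == output_hw[outp_name]).all()
```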
# # Redistribution and use in source and binary forms, with or without @@ -38,7 +38,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -46,6 +46,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt, use_1d=False): @@ -133,7 +134,7 @@ def prepare_inputs(input_tensor): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_pool_batch(idt, odt, pool_config, ifm_ch, pe, op_type, exec_mode): +def test_convert_to_hw_pool(idt, odt, pool_config, ifm_ch, pe, op_type, exec_mode): k, stride, pad, ifm_dim = pool_config if ifm_ch % pe != 0: @@ -156,10 +157,6 @@ def test_convert_to_hls_pool_batch(idt, odt, pool_config, ifm_ch, pe, op_type, e # prepare input data input_dict = prepare_inputs(x) if op_type == "MaxPool": - # if idt.signed(): - # pytest.skip("""No support for signed input (see accu initialization - # in Pool_batch HLSLIB function). Skipping""") - if idt != odt: pytest.skip("Skipping Maxpool with idt != odt") @@ -178,16 +175,25 @@ def test_convert_to_hls_pool_batch(idt, odt, pool_config, ifm_ch, pe, op_type, e y_expected = oxe.execute_onnx(model, input_dict)["outp"] - new_model = model.transform(to_hls.InferPool_Batch()) + new_model = model.transform(to_hw.InferPool()) new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(to_hw.InferConvInpGen()) + # to test cppsim, set preferred_impl_style for swg to hls + inst = getCustomOp(new_model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]) + inst.set_nodeattr("preferred_impl_style", "hls") + if pad != 0: + inst = getCustomOp(new_model.get_nodes_by_op_type("FMPadding")[0]) + inst.set_nodeattr("preferred_impl_style", "hls") + y_produced = oxe.execute_onnx(new_model, input_dict)["outp"] + assert (y_produced == y_expected).all() + new_model = new_model.transform(SpecializeLayers()) - new_model = new_model.transform(to_hls.InferConvInpGen()) # Folding for n in new_model.graph.node: if n.op_type.startswith("ConvolutionInputGenerator"): inst = getCustomOp(n) inst.set_nodeattr("SIMD", pe) - elif n.op_type == "Pool_Batch": + elif n.op_type.startswith("Pool"): inst = getCustomOp(n) inst.set_nodeattr("PE", pe) @@ -196,14 +202,14 @@ def test_convert_to_hls_pool_batch(idt, odt, pool_config, ifm_ch, pe, op_type, e assert len(new_model.graph.node) == 4 assert new_model.graph.node[0].op_type == "Transpose" assert new_model.graph.node[1].op_type.startswith("ConvolutionInputGenerator") - assert new_model.graph.node[2].op_type == "Pool_Batch" + assert new_model.graph.node[2].op_type.startswith("Pool") assert new_model.graph.node[3].op_type == "Transpose" else: assert len(new_model.graph.node) == 5 assert new_model.graph.node[0].op_type == "Transpose" - assert new_model.graph.node[1].op_type == "FMPadding_Batch" + assert 
new_model.graph.node[1].op_type.startswith("FMPadding") assert new_model.graph.node[2].op_type.startswith("ConvolutionInputGenerator") - assert new_model.graph.node[3].op_type == "Pool_Batch" + assert new_model.graph.node[3].op_type.startswith("Pool") assert new_model.graph.node[4].op_type == "Transpose" else: # not currently converted to HLS, node stays as-is @@ -230,7 +236,7 @@ def test_convert_to_hls_pool_batch(idt, odt, pool_config, ifm_ch, pe, op_type, e assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("Pool_Batch")[0] + node = new_model.get_nodes_by_op_type("Pool_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_convert_to_hw_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py new file mode 100755 index 0000000000..63cb5986e1 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hw_thresholding.py @@ -0,0 +1,205 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
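One detail of the pool test above worth spelling out: cppsim exists only for the `_hls` specializations, so tests that want C++ simulation pin `preferred_impl_style` to `"hls"` on nodes (such as the sliding-window generator, "swg") that would otherwise specialize to RTL. The idiom in isolation, with node names as in that test:

```
from qonnx.custom_op.registry import getCustomOp

# force HLS variants before SpecializeLayers() so that cppsim is possible
swg = model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]
getCustomOp(swg).set_nodeattr("preferred_impl_style", "hls")
for pad in model.get_nodes_by_op_type("FMPadding"):
    getCustomOp(pad).set_nodeattr("preferred_impl_style", "hls")
```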
+ +import pytest + +import numpy as np +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.core.onnx_exec as oxe +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers + +test_fpga_part = "xczu3eg-sbva484-1-e" +target_clk_ns = 5 + + +# Helper functions +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NHWC(FINN) to NCHW(Standard) +def layout_FINN2NCHW(data): + return np.transpose(data, (0, 3, 1, 2)) + + +# Convert from NCHW(Standard) to NHWC(FINN) +def layout_NCHW2FINN(data): + return np.transpose(data, (0, 2, 3, 1)) + + +def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): + return np.random.randint( + input_data_type.min(), + input_data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + + +def generate_pe_value(fold, num_input_channels): + if fold == -1: + fold = num_input_channels + pe = num_input_channels // fold + assert num_input_channels % pe == 0 + return pe + + +def make_single_multithresholding_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, +): + NumChannels = thresholds.shape[0] + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, num_input_vecs + [NumChannels]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, num_input_vecs + [NumChannels]) + + node_inp_list = ["inp", "thresh"] + + Multithresholding_node = helper.make_node( + "MultiThreshold", + node_inp_list, + ["outp"], + domain="qonnx.custom_op.general", + out_dtype=output_data_type.name, + out_bias=float(activation_bias), + out_scale=1.0, + ) + + graph = helper.make_graph( + nodes=[Multithresholding_node], + name="multithresholding_graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="multithresholding-model") + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) + + model.set_tensor_datatype("inp", input_data_type) + model.set_tensor_datatype("outp", output_data_type) + + model.set_tensor_datatype("thresh", input_data_type) + model.set_initializer("thresh", thresholds) + return model + + +# N.B. 
Fold values where C % PE != 0 fail +@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) +@pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) +@pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) +@pytest.mark.parametrize("num_input_channels", [16]) +@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_convert_multithreshold_to_hardware( + impl_style, + activation, + input_data_type, + fold, + num_input_channels, +): + # Handle inputs to the test + pe = generate_pe_value(fold, num_input_channels) + num_steps = activation.get_num_possible_values() - 1 + + # Other non-input parameters + num_input_vecs = [1, 2, 2] + output_data_type = activation + if output_data_type == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = output_data_type.min() + + # Generate random thresholds + thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + + # sort thresholds into ascending (non-decreasing) order + thresholds = sort_thresholds_increasing(thresholds) + + # Make a MultiThreshold graph and convert it to a Thresholding HW layer + model = make_single_multithresholding_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, + ) + + model = model.transform(InferThresholdingLayer()) + + # Perform functional validation of the InferThresholdingLayer transform + x = gen_finn_dt_tensor(input_data_type, tuple(num_input_vecs + [num_input_channels])) + + x_nchw = layout_FINN2NCHW(x) + y_expected = multithreshold(x_nchw, thresholds) + + # convert back to NHWC for comparison to hw outputs + y_expected = layout_NCHW2FINN(y_expected) + if activation == DataType["BIPOLAR"]: + # binary to bipolar + y_expected = 2 * y_expected - 1 + else: + # signed offset + y_expected += activation.min() + + input_dict = prepare_inputs(x) + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + assert (y_produced == y_expected).all() + + # Specialize the layer to the implementation style + # (RTL or HLS) selected by the test parameters + node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", impl_style) + model = model.transform(SpecializeLayers()) + model = model.transform(InferShapes()) + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py index 2ffd696528..b8242df933 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
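The golden reference in the new thresholding test above leans on qonnx's software `multithreshold`, which expects NCHW data while FINN streams NHWC, followed by a correction for the output datatype. The essential computation, assuming `x`, `thresholds` and `activation` as defined in that test:

```
import numpy as np
from qonnx.core.datatype import DataType
from qonnx.custom_op.general.multithreshold import multithreshold

x_nchw = np.transpose(x, (0, 3, 1, 2))  # NHWC (FINN) -> NCHW (reference)
y_expected = multithreshold(x_nchw, thresholds)
y_expected = np.transpose(y_expected, (0, 2, 3, 1))  # back to NHWC
if activation == DataType["BIPOLAR"]:
    y_expected = 2 * y_expected - 1  # map {0, 1} to {-1, +1}
else:
    y_expected += activation.min()  # apply the signed output bias
```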
# # Redistribution and use in source and binary forms, with or without @@ -45,7 +46,7 @@ import finn.core.onnx_exec as oxe from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import ( +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( InferConvInpGen, InferVectorVectorActivation, ) @@ -54,6 +55,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): @@ -166,7 +168,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding): +def test_depthwise_conv_hw_cppsim(act, pe, k, stride, padding): idt = wdt = DataType["INT4"] ifm_dim = 6 ifm_ch = 4 @@ -180,13 +182,14 @@ def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) - # set SIMD in ConvInputGen node and PE in VVAU node + new_model = new_model.transform(SpecializeLayers()) + # set SIMD in ConvInputGen node and PE in VVAU node for n in new_model.graph.node: - if n.op_type == "ConvolutionInputGenerator": + if n.op_type.startswith("ConvolutionInputGenerator"): convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", pe) - elif n.op_type == "VectorVectorActivation": + elif n.op_type.startswith("VVAU"): vvau_node = getCustomOp(n) vvau_node.set_nodeattr("PE", pe) new_model = new_model.transform(SetExecMode("cppsim")) @@ -209,7 +212,7 @@ def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding): +def test_depthwise_conv_hw_rtlsim(act, pe, k, stride, padding): idt = wdt = DataType["INT4"] ifm_dim = 6 ifm_ch = 4 @@ -223,13 +226,14 @@ def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) - # set SIMD in ConvInputGen node and PE in VVAU node + new_model = new_model.transform(SpecializeLayers()) + # set SIMD in ConvInputGen node and PE in VVAU node for n in new_model.graph.node: - if n.op_type == "ConvolutionInputGenerator": + if n.op_type.startswith("ConvolutionInputGenerator"): convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", pe) - elif n.op_type == "VectorVectorActivation": + elif n.op_type.startswith("VVAU"): vvau_node = getCustomOp(n) vvau_node.set_nodeattr("PE", pe) diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index f3716dea9b..338204c0c7 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -76,7 +76,6 @@ def test_fifosizing_linear(method, topology): build_cfg.DataflowOutputType.STITCHED_IP, build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, ], - default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED, ) build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg) with open(tmp_output_dir + "/report/estimate_network_performance.json") as f: diff --git 
a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py index 1ad2c26610..530d94e13b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -44,6 +44,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_addstreams_modelwrapper(ch, pe, idt): @@ -52,7 +53,7 @@ def make_addstreams_modelwrapper(ch, pe, idt): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch]) addstreams_node = helper.make_node( - "AddStreams_Batch", + "AddStreams", ["inp1", "inp2"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -60,6 +61,7 @@ def make_addstreams_modelwrapper(ch, pe, idt): NumChannels=ch, PE=pe, inputDataType=idt.name, + preferred_impl_style="hls", ) graph = helper.make_graph( nodes=[addstreams_node], @@ -104,6 +106,18 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): model = make_addstreams_modelwrapper(ch, pe, idt) + # prepare input data + input_dict = prepare_inputs(x1, x2) + oshape = model.get_tensor_shape("outp") + y = x1 + x2 + y_expected = y.reshape(oshape) + + # test verification flow before specializing layer + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all(), "Execution of hw layer failed" + + model = model.transform(SpecializeLayers()) + if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -117,12 +131,6 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): else: raise Exception("Unknown exec_mode") - # prepare input data - input_dict = prepare_inputs(x1, x2) - - oshape = model.get_tensor_shape("outp") - y = x1 + x2 - y_expected = y.reshape(oshape) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] y_produced = y_produced.reshape(y_expected.shape) @@ -130,7 +138,7 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): assert (y_produced == y_expected).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("AddStreams_Batch")[0] + node = model.get_nodes_by_op_type("AddStreams_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py index 186a6af42c..d5fa7c779f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py +++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. 
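Several of these tests close their rtlsim path by checking the measured latency against FINN's analytical cycle model, as the addstreams hunk above does. The pattern in isolation, using the specialized op type name:

```
import numpy as np
from qonnx.custom_op.registry import getCustomOp
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer

node = model.get_nodes_by_op_type("AddStreams_hls")[0]
cycles_rtlsim = getCustomOp(node).get_nodeattr("cycles_rtlsim")  # measured
exp_cycles = model.analysis(exp_cycles_per_layer)[node.name]  # estimated
assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)  # small slack allowed
assert exp_cycles != 0
```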
# # Redistribution and use in source and binary forms, with or without @@ -45,6 +46,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): @@ -56,7 +58,7 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): node_inp_list = ["inp", "const"] node = helper.make_node( - "ChannelwiseOp_Batch", + "ChannelwiseOp", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -68,6 +70,7 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): outputDataType=odt.name, paramDataType=pdt.name, numInputVectors=vecs, + preferred_impl_style="hls", ) graph = helper.make_graph(nodes=[node], name="graph", inputs=[inp], outputs=[outp]) @@ -109,13 +112,35 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m # generate input and param data x = gen_finn_dt_tensor(idt, tuple(vecs + [ich])) - # C = np.random.randint(idt.min(), idt.max() + 1, ich).astype(np.float32) C = gen_finn_dt_tensor(pdt, (ich)) odt = act + # create model model = make_modelwrapper(C, pe, idt, odt, pdt, func, vecs) + # package input data as dictionary + input_dict = {"inp": x} + + oshape = model.get_tensor_shape("outp") + + C_reshaped = np.broadcast_to(C.flatten(), x.shape) + if func == "add": + y = x + C_reshaped + elif func == "mul": + y = x * C_reshaped + + y_expected = y.reshape(oshape) + + # verify hw abstraction layer + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all(), "HW layer execution failed" + + model = model.transform(SpecializeLayers()) + if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -129,30 +154,18 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m else: raise Exception("Unknown exec_mode") - # package input data as dictionary - input_dict = {"inp": x} - - oshape = model.get_tensor_shape("outp") - - C_reshaped = np.broadcast_to(C.flatten(), x.shape) - if func == "add": - y = x + C_reshaped - elif func == "mul": - y = x * C_reshaped - - y_expected = y.reshape(oshape) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] y_produced = y_produced.reshape(y_expected.shape) - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_produced == y_expected).all(), exec_mode + " failed" if exec_mode == "rtlsim": hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "ChannelwiseOp_Batch_0" in hls_synt_res_est + assert "ChannelwiseOp_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("ChannelwiseOp_Batch")[0] + node = model.get_nodes_by_op_type("ChannelwiseOp_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index 403bb328ae..34a48996c9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -1,4 +1,5 @@ # Copyright (c) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
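The channelwise test above derives its golden output by broadcasting the per-channel parameter vector over the NHWC input, relying on channels being the innermost axis. A worked miniature with hypothetical shapes:

```
import numpy as np

x = np.arange(12, dtype=np.float32).reshape(1, 2, 3, 2)  # NHWC, 2 channels
C = np.array([10.0, -1.0], dtype=np.float32)  # one parameter per channel

# the last axis of x matches len(C), so a flat broadcast lines up per channel
C_reshaped = np.broadcast_to(C.flatten(), x.shape)
y_add = x + C_reshaped  # func == "add"
y_mul = x * C_reshaped  # func == "mul"
```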
# # Redistribution and use in source and binary forms, with or without @@ -48,6 +49,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -70,10 +72,10 @@ def create_two_fc_model(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", ["inp", "w0"], ["mid"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -85,14 +87,14 @@ def create_two_fc_model(): ActVal=actval, binaryXnorMode=binary_xnor_mode, noActivation=no_act, - mem_mode="decoupled", + mem_mode="internal_decoupled", ) fc1 = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", ["mid", "w1"], ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -104,7 +106,7 @@ def create_two_fc_model(): ActVal=actval, binaryXnorMode=binary_xnor_mode, noActivation=no_act, - mem_mode="decoupled", + mem_mode="internal_decoupled", ) graph = helper.make_graph( @@ -151,7 +153,7 @@ def test_fpgadataflow_checksum(): model = model.transform(InferShapes()) assert ( - len(model.get_nodes_by_op_type("CheckSum")) == 2 + len(model.get_nodes_by_op_type("CheckSum_hls")) == 2 ), """Insertion of checksum layers was unsuccessful""" @@ -166,14 +168,15 @@ def test_fpgadataflow_checksum(): model = model.transform(CompileCppSim()) inp = {"global_in": x} y_cppsim = oxe.execute_onnx(model, inp, return_full_exec_context=True) - checksum0_cppsim = y_cppsim["CheckSum_0_out1"] - checksum1_cppsim = y_cppsim["CheckSum_1_out1"] + checksum0_cppsim = y_cppsim["CheckSum_hls_0_out1"] + checksum1_cppsim = y_cppsim["CheckSum_hls_1_out1"] # in this test case scenario the checksums are equal assert checksum0_cppsim == checksum1_cppsim, "CheckSums are not equal" # rtlsim model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) @@ -187,7 +190,7 @@ def test_fpgadataflow_checksum(): def read_checksum_and_drain(sim): chk_addr = 16 drain_addr = 32 - for i in range(len(model.get_nodes_by_op_type("CheckSum"))): + for i in range(len(model.get_nodes_by_op_type("CheckSum_hls"))): axi_name = "s_axi_checksum_{}_".format(i) checksums.append(axilite_read(sim, chk_addr, basename=axi_name)) drain.append(axilite_read(sim, drain_addr, basename=axi_name)) @@ -196,7 +199,7 @@ def read_checksum_and_drain(sim): def write_drain(sim): addr = 32 - for i in range(len(model.get_nodes_by_op_type("CheckSum"))): + for i in range(len(model.get_nodes_by_op_type("CheckSum_hls"))): axi_name = "s_axi_checksum_{}_".format(i) axilite_write(sim, addr, drain_value, basename=axi_name) diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py index 2b2069a72b..b52b14fca3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_concat.py +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -1,4 +1,5 @@ # Copyright (c) 2021, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. 
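A rename that recurs across many hunks here: the MVAU `mem_mode` value `"decoupled"` becomes `"internal_decoupled"`. Only that mapping is visible in this diff; the helper below is hypothetical, and the `"const"` to `"internal_embedded"` entry is an assumption extrapolated from the new naming scheme:

```
# hypothetical migration helper for old mem_mode strings; only
# "decoupled" -> "internal_decoupled" is confirmed by this diff, while
# "const" -> "internal_embedded" is an assumed counterpart
MEM_MODE_RENAMES = {
    "decoupled": "internal_decoupled",
    "const": "internal_embedded",  # assumption
    "external": "external",  # unchanged
}


def migrate_mem_mode(old_value: str) -> str:
    return MEM_MODE_RENAMES.get(old_value, old_value)
```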
# # Redistribution and use in source and binary forms, with or without @@ -40,7 +41,7 @@ from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import InferConcatLayer +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferConcatLayer from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO @@ -48,6 +49,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_concat_model(i_shapes, idt): @@ -90,10 +92,15 @@ def test_fpgadataflow_concat(exec_mode, idt): inp_dict[model.graph.input[i].name] = i_data[i] ret = execute_onnx(model, inp_dict) assert (ret[oname] == exp_out).all() - # call transformation to convert to HLS and verify conversion + # call transformation to convert to HW and verify conversion model = model.transform(InferConcatLayer()) assert model.graph.node[0].op_type == "StreamingConcat" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" + ret = execute_onnx(model, inp_dict) + assert (ret[oname] == exp_out).all() + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "StreamingConcat_hls" + assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" if exec_mode == "cppsim": model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareCppSim()) @@ -130,11 +137,15 @@ def test_fpgadataflow_concat_stitchedip(): inp_dict[model.graph.input[i].name] = i_data[i] ret = execute_onnx(model, inp_dict) assert (ret[oname] == exp_out).all() - # call transformation to convert to HLS and verify conversion + # call transformation to convert to HW and verify conversion model = model.transform(InferConcatLayer()) assert model.graph.node[0].op_type == "StreamingConcat" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "StreamingConcat_hls" + assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(fpga_part, clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index d94b5d6399..45ca74fbea 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -32,11 +33,13 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -44,26 +47,34 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt): +def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, k * k * ifm_ch] + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] ) im2col_node = helper.make_node( "Im2Col", ["inp"], ["outp"], - domain="qonnx.custom_op.general", - stride=[stride, stride], - kernel_size=[k, k], - input_shape=str((1, ifm_dim, ifm_dim, ifm_ch)), + domain="qonnx.custom_op.general", + stride=[stride_h, stride_w], + kernel_size=[k_h, k_w], + input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), + dilations=[dilation_h, dilation_w], pad_amount=[0, 0, 0, 0], pad_value=0, - dilations=[dilation, dilation], + depthwise=dw, ) graph = helper.make_graph( nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] @@ -78,81 +89,117 @@ return model -def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0 -): - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, k * k * ifm_ch] - ) - - SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator", - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k, k], - IFMChannels=ifm_ch, - IFMDim=[ifm_dim, ifm_dim], - OFMDim=[ofm_dim, ofm_dim], - SIMD=simd, - Stride=[stride, stride], - Dilation=[dilation, dilation], - inputDataType=idt.name, - outputDataType=odt.name, - depthwise=dw, - ) - graph = helper.make_graph( - nodes=[SlidingWindow_node], - name="slidingwindow_graph", - inputs=[inp], - outputs=[outp], - ) - - model = qonnx_make_model(graph, producer_name="slidingwindow-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - 
model.set_tensor_datatype("outp", odt) - - return model - - def prepare_inputs(input_tensor): return {"inp": input_tensor} # input datatype -@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT2"]]) +@pytest.mark.parametrize("idt", [DataType["INT2"], DataType["UINT4"]]) # kernel size -@pytest.mark.parametrize("k", [2, 3]) +@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) # input dimension -@pytest.mark.parametrize("ifm_dim", [6, 8]) +@pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) # input channels @pytest.mark.parametrize("ifm_ch", [2, 4]) # Stride -@pytest.mark.parametrize("stride", [1, 2]) +@pytest.mark.parametrize("stride", [[1, 1], [2, 2], [2, 1]]) # Dilation -# Currently only dilation value of 1 is supported -@pytest.mark.parametrize("dilation", [1]) +@pytest.mark.parametrize("dilation", [[1, 1], [2, 2], [2, 1]]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1, 2]) +@pytest.mark.parametrize("simd", [1, 2, 4]) # depthwise @pytest.mark.parametrize("dw", [0, 1]) +# parallel_window enable (MMV_out = M*K) +@pytest.mark.parametrize("parallel_window", [0, 1]) +# in/out MMV ("M") +@pytest.mark.parametrize("m", [1]) +# Flip dimensions +@pytest.mark.parametrize("flip", [False]) +# implementation style +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw): - ofm_dim = int(((ifm_dim - k) / stride) + 1) +def test_fpgadataflow_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + exec_mode, + simd, + dw, + parallel_window, + m, + flip, + impl_style, +): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. 
dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) + # prepare input data + input_dict = prepare_inputs(x) + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + y_expected = oxe.execute_onnx(model, input_dict)["outp"] - x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch)) - model = make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw - ) + model = model.transform(to_hw.InferConvInpGen()) + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all() + # set impl_style + inst = getCustomOp(model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]) + inst.set_nodeattr("preferred_impl_style", impl_style) + model = model.transform(SpecializeLayers()) + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) @@ -167,28 +214,26 @@ def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, dilation, e else: raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow") - # prepare input data - input_dict = prepare_inputs(x) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] - golden = make_single_im2col_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt - ) - y_expected = oxe.execute_onnx(golden, input_dict)["outp"] if dw == 0: assert (y_produced == y_expected).all() else: - y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, k * k, ifm_ch // simd, simd) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) - y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) assert (y_produced == y_expected).all() - if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("ConvolutionInputGenerator")[0] - inst = getCustomOp(node) - cycles_rtlsim = 
inst.get_nodeattr("cycles_rtlsim") - exp_cycles_dict = model.analysis(exp_cycles_per_layer) - exp_cycles = exp_cycles_dict[node.name] - assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) - assert exp_cycles != 0 + if exec_mode == "rtlsim" and impl_style == "hls": + nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") + if nodes: + node = nodes[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 + else: + assert model.graph.node[0].op_type == "ConvolutionInputGenerator_rtl" diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py deleted file mode 100644 index aa89dde5e7..0000000000 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import pytest - -import numpy as np -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.im2col import compute_conv_output_dim -from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model - -import finn.core.onnx_exec as oxe -from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode - -fpga_part = "xczu3eg-sbva484-1-e" - - -def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - im2col_node = helper.make_node( - "Im2Col", - ["inp"], - ["outp"], - domain="qonnx.custom_op.general", - stride=[stride_h, stride_w], - kernel_size=[k_h, k_w], - input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), - dilations=[dilation_h, dilation_w], - pad_amount=[0, 0, 0, 0], - pad_value=0, - ) - graph = helper.make_graph( - nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] - ) - - model = qonnx_make_model(graph, producer_name="im2col-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, parallel_window, dw=0 -): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator1D", - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ifm_dim_h, ifm_dim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=simd, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=idt.name, - outputDataType=odt.name, - depthwise=dw, - parallel_window=parallel_window, - ) - graph = helper.make_graph( - nodes=[SlidingWindow_node], - name="slidingwindow_graph", - inputs=[inp], - outputs=[outp], - ) - - model = qonnx_make_model(graph, producer_name="slidingwindow-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def prepare_inputs(input_tensor): - return {"inp": input_tensor} - - -# input datatype -# @pytest.mark.parametrize("idt", [DataType["BIPOLAR"], 
DataType["INT8"]]) -@pytest.mark.parametrize("idt", [DataType["INT8"]]) -# kernel size -@pytest.mark.parametrize("k", [[4, 1]]) -# input dimension -@pytest.mark.parametrize("ifm_dim", [[10, 1]]) -# input channels -@pytest.mark.parametrize("ifm_ch", [1, 4]) -# Stride -@pytest.mark.parametrize("stride", [[1, 1], [2, 1]]) -# Dilation -@pytest.mark.parametrize("dilation", [[1, 1], [2, 1]]) -# execution mode -@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) -# input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1, 4]) -# depthwise -@pytest.mark.parametrize("dw", [0, 1]) -# Flip dimensions -@pytest.mark.parametrize("flip", [False, True]) -# Use parallel window output variant -@pytest.mark.parametrize("parallel_window", [False, True]) -@pytest.mark.fpgadataflow -@pytest.mark.slow -@pytest.mark.vivado -def test_fpgadataflow_slidingwindow_1d( - idt, - k, - ifm_dim, - ifm_ch, - stride, - dilation, - exec_mode, - simd, - dw, - flip, - parallel_window, -): - if flip: - k = k[::-1] - ifm_dim = ifm_dim[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - - if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1): - pytest.skip( - """Dilation value greater than 1 and stride greater than 1 - currently not supported for 1D convolutions""" - ) - if (dilation_h > 1 or dilation_w > 1) and dw == 0: - pytest.skip( - """Dilation value greater than 1 currently not supported - for non-dws 1D convolutions""" - ) - if simd > ifm_ch: - pytest.skip("SIMD cannot be larger than number of input channels") - - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) - ofm_dim = [ofm_dim_h, ofm_dim_w] - - x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) - model = make_single_slidingwindow_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - stride=stride, - dilation=dilation, - idt=idt, - parallel_window=parallel_window, - dw=dw, - ) - - if exec_mode == "cppsim": - model = model.transform(SetExecMode("cppsim")) - model = model.transform(PrepareCppSim()) - model = model.transform(CompileCppSim()) - elif exec_mode == "rtlsim": - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(fpga_part, 5)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - else: - raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow") - - # prepare input data - input_dict = prepare_inputs(x) - # execute model - y_produced = oxe.execute_onnx(model, input_dict)["outp"] - golden = make_single_im2col_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - stride=stride, - dilation=dilation, - idt=idt, - ) - y_expected = oxe.execute_onnx(golden, input_dict)["outp"] - - if dw == 0: - assert (y_produced == y_expected).all() - else: - y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) - y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) - y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) - assert (y_produced == y_expected).all() - - if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("ConvolutionInputGenerator1D")[0] - inst = getCustomOp(node) - cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") - 
exp_cycles_dict = model.analysis(exp_cycles_per_layer) - exp_cycles = exp_cycles_dict[node.name] - assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) - assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py deleted file mode 100755 index 4b6f9f4913..0000000000 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
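The `dw == 1` branch of the expected-output check, shared by the new combined test and both deleted variants, reorders the im2col reference because a depthwise sliding-window generator emits channel-major groups of SIMD lanes rather than im2col's kernel-position-major layout. The reshape/transpose trick in isolation, with names as in the tests:

```
# y_expected: im2col reference, shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch)
y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd)
y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)  # swap kernel-pos and channel-group axes
y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w)
```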
- -import pytest - -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.im2col import compute_conv_output_dim -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model - -import finn.core.onnx_exec as oxe -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode - - -def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - im2col_node = helper.make_node( - "Im2Col", - ["inp"], - ["outp"], - domain="finn.custom_op.general", - stride=[stride_h, stride_w], - kernel_size=[k_h, k_w], - input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), - dilations=[dilation_h, dilation_w], - pad_amount=[0, 0, 0, 0], - pad_value=0, - ) - graph = helper.make_graph( - nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] - ) - - model = qonnx_make_model(graph, producer_name="im2col-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0 -): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator_rtl", - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ifm_dim_h, ifm_dim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=simd, - M=m, - parallel_window=parallel_window, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=idt.name, - outputDataType=odt.name, - depthwise=dw, - ) - graph = helper.make_graph( - nodes=[SlidingWindow_node], - name="slidingwindow_graph", - inputs=[inp], - outputs=[outp], - ) - - model = qonnx_make_model(graph, producer_name="slidingwindow-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def prepare_inputs(input_tensor): - return {"inp": input_tensor} - - -# input datatype -@pytest.mark.parametrize("idt", [DataType["INT2"], DataType["UINT4"]]) -# kernel size -@pytest.mark.parametrize("k", [[3, 3], [1, 5]]) -# input dimension -@pytest.mark.parametrize("ifm_dim", [[13, 13], [1, 21]]) -# input channels -@pytest.mark.parametrize("ifm_ch", [6]) -# Stride -@pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) -# Dilation -@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) -# depthwise -@pytest.mark.parametrize("dw", [0, 1]) -# 
input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1, 3, 6]) -# parallel_window enable (MMV_out = M*K) -@pytest.mark.parametrize("parallel_window", [0, 1]) -# in/out MMV ("M") -@pytest.mark.parametrize("m", [1]) -# Flip dimensions -@pytest.mark.parametrize("flip", [False]) -@pytest.mark.slow -@pytest.mark.vivado -@pytest.mark.fpgadataflow -def test_fpgadataflow_slidingwindow_rtl( - idt, k, ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip -): - if flip: - if ( - ifm_dim[0] == ifm_dim[1] - and k[0] == k[1] - and stride[0] == stride[1] - and dilation[0] == dilation[1] - ): - pytest.skip("Dimension flip would have no effect") - k = k[::-1] - ifm_dim = ifm_dim[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - - kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation - kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation - - if simd > ifm_ch: - pytest.skip("SIMD cannot be larger than number of input channels") - if ifm_ch % simd != 0: - pytest.skip("SIMD must divide number of input channels") - if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: - pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") - if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: - pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") - if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): - pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") - if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): - pytest.skip("Not all combinations for stride > k edge case supported in default mode") - if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): - pytest.skip("Parallel window requires SIMD=C for non-depthwise case") - - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) - ofm_dim = [ofm_dim_h, ofm_dim_w] - - x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) - model = make_single_slidingwindow_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - m=m, - parallel_window=parallel_window, - stride=stride, - dilation=dilation, - idt=idt, - dw=dw, - ) - - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) - model = model.transform(PrepareRTLSim()) - - # prepare input data - input_dict = prepare_inputs(x) - # execute model - y_produced = oxe.execute_onnx(model, input_dict)["outp"] - golden = make_single_im2col_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - stride=stride, - dilation=dilation, - idt=idt, - ) - y_expected = oxe.execute_onnx(golden, input_dict)["outp"] - - if dw == 0: - assert (y_produced == y_expected).all() - else: - y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) - y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) - y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) - assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index f5a06316e2..6c0712b7b0 100644 --- 
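# For reference, the ofm_dim_h / ofm_dim_w values in these tests follow the
# standard dilated-convolution output formula; a sketch of what qonnx's
# compute_conv_output_dim evaluates (assuming total_pad counts both sides,
# and zero padding as used here):
def conv_out_dim(ifm_dim, k, stride, total_pad=0, dilation=1):
    k_dilated = (k - 1) * dilation + 1  # same as kernel_width/height above
    return (ifm_dim + total_pad - k_dilated) // stride + 1

assert conv_out_dim(13, 3, 2) == 6  # e.g. ifm_dim=13, k=3, stride=2
assert conv_out_dim(1, 1, 1) == 1   # the unit dimension of a 1D input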
a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, Advanced Micro Devices, Inc. +# Copyright (c) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -48,7 +48,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.core.onnx_exec import execute_onnx from finn.core.rtlsim_exec import rtlsim_exec @@ -60,6 +60,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import pyverilate_get_liveness_threshold_cycles @@ -248,10 +249,11 @@ def test_fpgadataflow_conv_dynamic(cfg): # convert to hardware and prepare simulation model = largest_model.transform(LowerConvsToMatMul()) - model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) - model = model.transform(to_hls.InferVectorVectorActivation()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferVectorVectorActivation()) model = model.transform(absorb.AbsorbConsecutiveTransposes()) + model = model.transform(SpecializeLayers()) parent_model = model.transform(CreateDataflowPartition()) sdp_inst = getCustomOp(parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]) model = ModelWrapper(sdp_inst.get_nodeattr("model")) @@ -267,8 +269,10 @@ def test_fpgadataflow_conv_dynamic(cfg): getCustomOp(swg_node).set_nodeattr("dynamic_mode", 1) getCustomOp(swg_node).set_nodeattr("inFIFODepths", [16]) getCustomOp(swg_node).set_nodeattr("outFIFODepths", [16]) - comp_nodes = model.get_nodes_by_op_type("MatrixVectorActivation") - comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation") + comp_nodes = model.get_nodes_by_op_type("MVAU_hls") + comp_nodes += model.get_nodes_by_op_type("MVAU_rtl") + comp_nodes += model.get_nodes_by_op_type("VVAU_hls") + comp_nodes += model.get_nodes_by_op_type("VVAU_rtl") for comp_node in comp_nodes: if depthwise: getCustomOp(comp_node).set_nodeattr("PE", 4) @@ -277,6 +281,7 @@ def test_fpgadataflow_conv_dynamic(cfg): getCustomOp(comp_node).set_nodeattr("PE", 4) model = model.transform(InsertDWC()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -404,7 +409,7 @@ def make_single_slidingwindow_modelwrapper( ) SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator_rtl", + "ConvolutionInputGenerator", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -518,9 +523,11 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( dw=dw, ) + model = model.transform(SpecializeLayers()) # Simulate using stitched-ip-rtlsim so we can use existing infrastructure # 
that supports hook functions to re-program configuration before rtlsim model = model.transform(InsertFIFO(True)) # required for proper simulation + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) model = model.transform(HLSSynthIP()) @@ -547,7 +554,7 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( configs = [("s_axilite_0_", config)] # Also update FIFO nodes and corresponding tensors - fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[0] + fifo_node = model.get_nodes_by_op_type("StreamingFIFO_rtl")[0] fifo_inst = getCustomOp(fifo_node) shape = fifo_inst.get_nodeattr("folded_shape") shape[1] = ifm_dim_h @@ -555,7 +562,7 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( fifo_inst.set_nodeattr("folded_shape", shape) update_tensor_dim(model, fifo_node.input[0], ifm_dim) - fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[1] + fifo_node = model.get_nodes_by_op_type("StreamingFIFO_rtl")[1] fifo_inst = getCustomOp(fifo_node) shape = fifo_inst.get_nodeattr("folded_shape") shape[1] = ofm_dim_h diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py index 6c25be0f85..f1fc989066 100644 --- a/tests/fpgadataflow/test_fpgadataflow_deconv.py +++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, Advanced Micro Devices, Inc. +# Copyright (c) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -41,7 +41,7 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import ( +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( InferConvInpGen, InferQuantizedMatrixVectorActivation, ) @@ -49,10 +49,14 @@ from finn.transformation.fpgadataflow.infer_pixel_padding_deconv import ( InferPixelPaddingDeconv, ) +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import pynq_part_map test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -146,14 +150,6 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, idim_h, idim_w = idim stride_h, stride_w = stride - if idim_h == idim_w and stride_h == stride_w: - convinpgen_rtl = False - else: - convinpgen_rtl = True - - if exec_mode == "cppsim" and convinpgen_rtl: - pytest.skip("ConvolutionInputGenerator_rtl has no cppsim, skipping cppsim") - ref_model = set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding) odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1 @@ -162,23 +158,30 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, input_tensor = gen_finn_dt_tensor(idt, [1, ifm_ch, idim_h, idim_w]) input_dict = {"inp": input_tensor} + y_expected = oxe.execute_onnx(ref_model, input_dict)["outp"] + model = ref_model.transform(InferPixelPaddingDeconv()) - model = 
model.transform(InferConvInpGen(use_rtl_variant=convinpgen_rtl)) + model = model.transform(InferConvInpGen()) model = model.transform(InferQuantizedMatrixVectorActivation()) model = model.transform(InferShapes()) model = model.transform(GiveUniqueNodeNames()) + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all() + + model = model.transform(SpecializeLayers()) + model = model.transform(MinimizeAccumulatorWidth()) + for n in model.graph.node: - if n.op_type == "ConvolutionInputGenerator" and not convinpgen_rtl: + if n.op_type.startswith("ConvolutionInputGenerator"): convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", simd) - elif n.op_type == "MatrixVectorActivation": + elif n.op_type.startswith("MVAU"): mvau_node = getCustomOp(n) mvau_node.set_nodeattr("PE", pe) mvau_node.set_nodeattr("SIMD", simd) expected_oshape = (1, ofm_ch, odim_h, odim_w) - y_expected = oxe.execute_onnx(ref_model, input_dict)["outp"] # cppsim if exec_mode == "cppsim": @@ -188,6 +191,7 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, # rtlsim else: + model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) @@ -198,7 +202,7 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("FMPadding_Pixel")[0] + node = model.get_nodes_by_op_type("FMPadding_Pixel_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_downsampler.py b/tests/fpgadataflow/test_fpgadataflow_downsampler.py index 8a3c1fe682..25717a4152 100644 --- a/tests/fpgadataflow/test_fpgadataflow_downsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_downsampler.py @@ -39,7 +39,7 @@ from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.util.basic import gen_finn_dt_tensor -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim @@ -48,6 +48,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=False): @@ -126,8 +127,11 @@ def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): inp = gen_finn_dt_tensor(dt_in, model.get_tensor_shape("in0")) idict = {"in0": inp} y_expected = execute_onnx(model, idict)["out0"] - model = model.transform(to_hls.InferConvInpGen()) + model = model.transform(to_hw.InferConvInpGen()) assert len(model.get_nodes_by_op_type("DownSampler")) == 1 + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all() + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) @@ -143,7 +147,7 @@ def 
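# The deconv test above sizes its expected output with the standard
# transposed-convolution relation odim = (idim - 1) * stride - 2 * padding
# + (k - 1) + 1. A quick arithmetic check with hypothetical values (not
# necessarily the test's own parametrization):
idim, stride, padding, k = 4, 2, 1, 4
odim = (idim - 1) * stride - 2 * padding + (k - 1) + 1
assert odim == 8  # 3*2 - 2 + 3 + 1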
test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): y_produced = execute_onnx(model, idict)["out0"] assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("DownSampler")[0] + node = model.get_nodes_by_op_type("DownSampler_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py index 27bab93fb6..62b9265466 100644 --- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,9 +47,10 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl): +def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl, impl_style): shape = [1, idim, idim, ch] inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) out_names = [] @@ -59,7 +61,7 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl): out_vi.append(helper.make_tensor_value_info(outp_name, TensorProto.FLOAT, shape)) dupstrm_node = helper.make_node( - "DuplicateStreams_Batch", + "DuplicateStreams", ["inp"], out_names, domain="finn.custom_op.fpgadataflow", @@ -69,6 +71,7 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl): PE=pe, inputDataType=idt.name, numInputVectors=[1, idim, idim], + preferred_impl_style=impl_style, ) graph = helper.make_graph(nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=out_vi) @@ -99,9 +102,11 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("n_dupl", [2, 3]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# impl_style +@pytest.mark.parametrize("impl_style", ["hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode): +def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode, impl_style): if fold == -1: pe = 1 else: @@ -111,7 +116,19 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode): # generate input data x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch)) - model = make_dupstreams_modelwrapper(ch, pe, imdim, idt, n_dupl) + model = make_dupstreams_modelwrapper(ch, pe, imdim, idt, n_dupl, impl_style) + + # prepare input data and execute + input_dict = prepare_inputs(x, idt) + + # check behavior of hw abstraction layer + output_dict = oxe.execute_onnx(model, input_dict) + expected_y = x + for i in range(n_dupl): + y = output_dict["outp%d" % i] + assert (y == expected_y).all(), "HW layer execution failed" + + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -126,17 +143,14 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode): else: raise Exception("Unknown exec_mode") - # prepare input data and execute - input_dict = prepare_inputs(x, idt) output_dict = 
oxe.execute_onnx(model, input_dict) - expected_y = x for i in range(n_dupl): y = output_dict["outp%d" % i] assert (y == expected_y).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("DuplicateStreams_Batch")[0] + node = model.get_nodes_by_op_type("DuplicateStreams_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index 47332f069b..7152d32a7b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -1,5 +1,5 @@ # Copyright (C) 2020-2022, Xilinx, Inc. -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -36,20 +36,22 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style, use_rtl_variant): +def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) - if use_rtl_variant: - optype = "StreamingDataWidthConverter_rtl" - else: - optype = "StreamingDataWidthConverter_Batch" + optype = "StreamingDataWidthConverter" DWC_node = helper.make_node( optype, @@ -62,10 +64,6 @@ def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_styl outWidth=outWidth, dataType=str(finn_dtype.name), ) - if not use_rtl_variant: - # add additional attribute - impl_attr = helper.make_attribute("impl_style", impl_style) - DWC_node.attribute.append(impl_attr) graph = helper.make_graph(nodes=[DWC_node], name="dwc_graph", inputs=[inp], outputs=[outp]) @@ -85,39 +83,86 @@ def prepare_inputs(input_tensor, dt): @pytest.mark.parametrize( "config", [ - ([1, 24], 6, 4, DataType["INT2"], "hls"), - ([1, 24], 4, 6, DataType["INT2"], "hls"), - ([1, 4], 2, 4, DataType["BIPOLAR"], "hls"), - ([1, 2, 8], 2, 4, DataType["BIPOLAR"], "hls"), - ([1, 4], 4, 2, DataType["INT2"], "hls"), - ([1, 2, 8], 4, 4, DataType["INT2"], "hls"), - ([1, 2, 8], 8, 16, DataType["INT2"], "vivado"), + ([1, 24], 6, 4, DataType["INT2"]), + ([1, 24], 4, 6, DataType["INT2"]), + ([1, 4], 2, 4, DataType["BIPOLAR"]), + ([1, 2, 8], 2, 4, DataType["BIPOLAR"]), + ([1, 4], 4, 2, DataType["INT2"]), + ([1, 2, 8], 4, 4, DataType["INT2"]), + ([1, 2, 8], 8, 16, DataType["INT2"]), ], ) -@pytest.mark.parametrize("use_rtl_variant", [0, 1]) +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_dwc_rtlsim(config, 
use_rtl_variant): - shape, inWidth, outWidth, finn_dtype, impl_style = config - - if use_rtl_variant: - iwidth_d = inWidth % outWidth == 0 - owidth_d = outWidth % inWidth == 0 - if not (iwidth_d or owidth_d): - pytest.skip("RTL variant only supports stream widths that are divisible by int ratios") +def test_fpgadataflow_dwc(config, exec_mode): + shape, inWidth, outWidth, finn_dtype = config + + test_fpga_part = "xc7z020clg400-1" + # generate input data + x = gen_finn_dt_tensor(finn_dtype, shape) + input_dict = prepare_inputs(x, finn_dtype) + + model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype) + # verify abstraction level execution + y = oxe.execute_onnx(model, input_dict)["outp"] + assert ( + y == x + ).all(), """The output values are not the same as the + input values anymore.""" + assert y.shape == tuple(shape), """The output shape is incorrect.""" + + model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(PrepareIP(test_fpga_part, 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) + y = oxe.execute_onnx(model, input_dict)["outp"] + + assert ( + y == x + ).all(), """The output values are not the same as the + input values anymore.""" + assert y.shape == tuple(shape), """The output shape is incorrect.""" + + +@pytest.mark.parametrize( + "config", + [ + ([1, 24], 6, 4, DataType["INT2"]), + ([1, 24], 4, 6, DataType["INT2"]), + ([1, 4], 2, 4, DataType["BIPOLAR"]), + ([1, 2, 8], 2, 4, DataType["BIPOLAR"]), + ([1, 4], 4, 2, DataType["INT2"]), + ([1, 2, 8], 4, 4, DataType["INT2"]), + ([1, 2, 8], 8, 16, DataType["INT2"]), + ], +) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_dwc_stitched_rtlsim(config): + shape, inWidth, outWidth, finn_dtype = config + test_fpga_part = "xc7z020clg400-1" target_clk_ns = 10.0 # generate input data x = gen_finn_dt_tensor(finn_dtype, shape) input_dict = prepare_inputs(x, finn_dtype) - model = make_single_dwc_modelwrapper( - shape, inWidth, outWidth, finn_dtype, impl_style, use_rtl_variant - ) + model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype) + model = model.transform(SpecializeLayers()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, 5)) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model.set_metadata_prop("exec_mode", "rtlsim") diff --git a/tests/fpgadataflow/test_fpgadataflow_eltwise.py b/tests/fpgadataflow/test_fpgadataflow_eltwise.py index 6028a9b9f0..fbfcc8e28b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_eltwise.py +++ b/tests/fpgadataflow/test_fpgadataflow_eltwise.py @@ -1,4 +1,5 @@ # Copyright (c) 2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. 
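# Bit-level bookkeeping behind the DWC configs above, e.g. ([1, 24], 6, 4,
# DataType["INT2"]): the total bits per image must form whole beats on both
# sides of the converter. A small sanity sketch:
elems, ebits = 24, 2        # [1, 24] tensor of INT2 elements
total_bits = elems * ebits  # 48 bits per image
assert total_bits % 6 == 0  # 8 beats at inWidth = 6
assert total_bits % 4 == 0  # 12 beats at outWidth = 4
# Note that 6 % 4 != 0 and 4 % 6 != 0: the deleted use_rtl_variant skip
# excluded exactly this width ratio; picking a legal implementation is now
# deferred to SpecializeLayers.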
# # Redistribution and use in source and binary forms, with or without @@ -38,7 +39,7 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import gen_finn_dt_tensor -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim @@ -47,6 +48,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def build_model(shp, dt0, dt1, do_abs): @@ -105,9 +107,17 @@ def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): in1 = gen_finn_dt_tensor(dt1, shp) idict = {"in0": in0, "in1": in1} y_expected = execute_onnx(model, idict)["out0"] - model = model.transform(to_hls.InferStreamingEltwise()) + model = model.transform(to_hw.InferStreamingEltwise()) assert len(model.graph.node) == 1 assert model.graph.node[0].op_type == "StreamingEltwise" + + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all(), "HW layer execution failed" + + model = model.transform(SpecializeLayers()) + + assert len(model.graph.node) == 1 + assert model.graph.node[0].op_type == "StreamingEltwise_hls" getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -124,7 +134,7 @@ def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): y_produced = execute_onnx(model, idict)["out0"] assert (y_produced == y_expected).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("StreamingEltwise")[0] + node = model.get_nodes_by_op_type("StreamingEltwise_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py index 27417a78e1..1719da1454 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fifo.py +++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
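# Schematic view of what SpecializeLayers does to each node in these tests
# (a sketch of the naming convention visible in this diff, not the
# transform's actual code): the backend-neutral op keeps its name, and the
# specialized variant gains an _hls or _rtl suffix plus a matching domain.
def specialize(op_type: str, impl_style: str) -> tuple:
    return (
        op_type + "_" + impl_style,
        "finn.custom_op.fpgadataflow." + impl_style,
    )

assert specialize("StreamingEltwise", "hls") == (
    "StreamingEltwise_hls",
    "finn.custom_op.fpgadataflow.hls",
)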
# # Redistribution and use in source and binary forms, with or without @@ -40,6 +41,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers build_dir = os.environ["FINN_BUILD_DIR"] test_fpga_part = "xc7z020clg400-1" @@ -58,6 +60,7 @@ def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype): backend="fpgadataflow", depth=Depth, folded_shape=fld_shape, + normal_shape=Shape, dataType=str(finn_dtype.name), ) @@ -83,7 +86,7 @@ def prepare_inputs(input_tensor, dt): # outWidth @pytest.mark.parametrize("depth", [16]) # finn_dtype -@pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"]]) # , DataType["INT2"]]) +@pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"], DataType["INT2"]]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado @@ -93,6 +96,7 @@ def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype): input_dict = prepare_inputs(x, finn_dtype) model = make_single_fifo_modelwrapper(Shape, depth, folded_shape, finn_dtype) + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index c871811c5e..45cc265ac7 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020-2022, Xilinx, Inc. +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -46,6 +47,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import pynq_part_map test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -53,7 +55,7 @@ target_clk_ns = 10 -def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt): +def make_single_fmpadding_modelwrapper(impl_style, idim, padding, num_ch, simd, idt): pad_h = padding[0] + padding[2] pad_w = padding[1] + padding[3] idim_h, idim_w = idim @@ -66,7 +68,7 @@ def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, odim_h, odim_w, num_ch]) FMPadding = helper.make_node( - optype, + "FMPadding", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -77,6 +79,7 @@ def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt) inputDataType=str(idt.name), numInputVectors=1, SIMD=simd, + preferred_impl_style=impl_style, ) graph = helper.make_graph( @@ -110,8 +113,6 @@ def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt) @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): - if impl_style == "rtl" and mode == "cppsim": - pytest.skip("rtl implstyle has no cppsim, skipping") if num_ch % simd != 0: pytest.skip(" num_ch % simd != 0, skipping") @@ -125,9 +126,17 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): odim_h = idim_h + pad_h odim_w = idim_w + pad_w - optype = {"hls": "FMPadding_Batch", "rtl": "FMPadding_rtl"}[impl_style] + y_expected = np.pad(x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant") + expected_oshape = (1, odim_h, odim_w, num_ch) + + model = make_single_fmpadding_modelwrapper(impl_style, idim, pad, num_ch, simd, idt) + + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert y_produced.shape == expected_oshape + assert (y_produced == y_expected).all(), "HW layer execution failed" + + model = model.transform(SpecializeLayers()) - model = make_single_fmpadding_modelwrapper(optype, idim, pad, num_ch, simd, idt) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) model = model.transform(GiveUniqueNodeNames()) @@ -140,15 +149,13 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): model = model.transform(PrepareRTLSim()) y_produced = oxe.execute_onnx(model, input_dict)["outp"] - expected_oshape = (1, odim_h, odim_w, num_ch) - assert y_produced.shape == expected_oshape - - y_expected = np.pad(x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant") + assert y_produced.shape == expected_oshape assert (y_produced == y_expected).all() if mode == "rtlsim": - node = model.get_nodes_by_op_type(optype)[0] + op_type = "FMPadding_" + impl_style + node = model.get_nodes_by_op_type(op_type)[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py index 1b3d87c11f..9c2802aade 100644 --- 
a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -44,14 +45,15 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_accpool_modelwrapper(ch, pe, idim, idt): +def make_accpool_modelwrapper(ch, pe, idim, idt, impl_style): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, idim, idim, ch]) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 1, 1, ch]) accpool_node = helper.make_node( - "GlobalAccPool_Batch", + "GlobalAccPool", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -60,6 +62,7 @@ def make_accpool_modelwrapper(ch, pe, idim, idt): PE=pe, inputDataType=idt.name, numInputVectors=[1, idim, idim], + preferred_impl_style=impl_style, ) graph = helper.make_graph(nodes=[accpool_node], name="graph", inputs=[inp], outputs=[outp]) @@ -85,9 +88,11 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("imdim", [7]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# impl_style +@pytest.mark.parametrize("impl_style", ["hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): +def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode, impl_style): if fold == -1: pe = 1 else: @@ -97,7 +102,17 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): # generate input data x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch)) - model = make_accpool_modelwrapper(ch, pe, imdim, idt) + # prepare input data and execute + input_dict = prepare_inputs(x, idt) + expected_y = np.sum(x, axis=(1, 2)).flatten() + + model = make_accpool_modelwrapper(ch, pe, imdim, idt, impl_style) + + y = oxe.execute_onnx(model, input_dict)["outp"] + + assert (y == expected_y).all(), "HW layer verification failed" + + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -112,15 +127,12 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): else: raise Exception("Unknown exec_mode") - # prepare input data and execute - input_dict = prepare_inputs(x, idt) y = oxe.execute_onnx(model, input_dict)["outp"] - expected_y = np.sum(x, axis=(1, 2)).flatten() assert (y == expected_y).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("GlobalAccPool_Batch")[0] + node = model.get_nodes_by_op_type("GlobalAccPool_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index 2d85cc98f4..2061601b4a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
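# The ipstitch changes below apply the mem_mode renaming used throughout
# this diff; the mapping, as implied by the updated parametrizations
# (old name -> new name):
MEM_MODE_RENAMES = {
    "const": "internal_embedded",
    "decoupled": "internal_decoupled",
    "external": "external",  # unchanged
}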
# # Redistribution and use in source and binary forms, with or without @@ -61,7 +62,7 @@ ip_stitch_model_dir = os.environ["FINN_BUILD_DIR"] -def create_one_fc_model(mem_mode="const"): +def create_one_fc_model(mem_mode="internal_embedded"): # create a model with a MatrixVectorActivation instance with no activation # the wider range of the full accumulator makes debugging a bit easier wdt = DataType["INT2"] @@ -78,10 +79,10 @@ def create_one_fc_model(mem_mode="const"): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", ["inp", "w0"], ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -113,7 +114,7 @@ def create_one_fc_model(mem_mode="const"): return model -def create_two_fc_model(mem_mode="decoupled"): +def create_two_fc_model(mem_mode="internal_decoupled"): # create a model with two MatrixVectorActivation instances wdt = DataType["INT2"] idt = DataType["INT32"] @@ -130,10 +131,10 @@ def create_two_fc_model(mem_mode="decoupled"): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", ["inp", "w0"], ["mid"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -149,10 +150,10 @@ def create_two_fc_model(mem_mode="decoupled"): ) fc1 = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", ["mid", "w1"], ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -194,7 +195,7 @@ def create_two_fc_model(mem_mode="decoupled"): return model -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_gen_model(mem_mode): @@ -208,12 +209,12 @@ def test_fpgadataflow_ipstitch_gen_model(mem_mode): model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, 5)) model = model.transform(HLSSynthIP()) - assert model.graph.node[0].op_type == "MatrixVectorActivation" - assert model.graph.node[-1].op_type == "TLastMarker" + assert model.graph.node[0].op_type == "MVAU_hls" + assert model.graph.node[-1].op_type == "TLastMarker_hls" model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model_%s.onnx" % mem_mode) -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_do_stitch(mem_mode): @@ -231,7 +232,7 @@ def test_fpgadataflow_ipstitch_do_stitch(mem_mode): model.save(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch_%s.onnx" % mem_mode) -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_rtlsim(mem_mode): @@ -280,7 +281,7 @@ def test_fpgadataflow_ipstitch_rtlsim(mem_mode): assert (rtlsim_res == x).all() -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow @@ -335,7 +336,7 @@ def test_fpgadataflow_ipstitch_vitis_end2end(board, 
period_ns, extw): pytest.skip("VITIS_PATH not set") platform = alveo_default_platform[board] fpga_part = alveo_part_map[board] - model = create_two_fc_model("external" if extw else "decoupled") + model = create_two_fc_model("external" if extw else "internal_decoupled") if model.graph.node[0].op_type == "StreamingDataflowPartition": sdp_node = getCustomOp(model.graph.node[0]) assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py index efd093b0b3..98ded66ca7 100644 --- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py +++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -42,15 +43,16 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.test import soft_verify_topk -def make_labelselect_modelwrapper(labels, pe, k, idt): +def make_labelselect_modelwrapper(labels, pe, k, idt, impl_style): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, labels]) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, k]) + outp = helper.make_tensor_value_info("outp", TensorProto.INT64, [1, k]) labelselect_node = helper.make_node( - "LabelSelect_Batch", + "LabelSelect", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -59,6 +61,7 @@ def make_labelselect_modelwrapper(labels, pe, k, idt): PE=pe, K=k, inputDataType=idt.name, + preferred_impl_style=impl_style, ) graph = helper.make_graph( nodes=[labelselect_node], @@ -90,9 +93,11 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("k", [1, 5]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# impl style +@pytest.mark.parametrize("impl_style", ["hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode): +def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode, impl_style): np.random.seed(0) if fold == -1: pe = 1 @@ -105,8 +110,15 @@ def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode): # generate input data x = gen_finn_dt_tensor(idt, (1, labels)) + input_dict = prepare_inputs(x, idt) + + model = make_labelselect_modelwrapper(labels, pe, k, idt, impl_style) + + y = oxe.execute_onnx(model, input_dict)["outp"] + + assert soft_verify_topk(x, y, k), "HW layer execution failed" - model = make_labelselect_modelwrapper(labels, pe, k, idt) + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -121,8 +133,6 @@ def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode): else: raise Exception("Unknown exec_mode") - # prepare input data and execute - input_dict = prepare_inputs(x, idt) y = oxe.execute_onnx(model, input_dict)["outp"] assert soft_verify_topk(x, y, k), exec_mode + " failed" diff --git a/tests/fpgadataflow/test_fpgadataflow_lookup.py b/tests/fpgadataflow/test_fpgadataflow_lookup.py index d2861261b6..cb15fa3ae5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_lookup.py +++ 
b/tests/fpgadataflow/test_fpgadataflow_lookup.py @@ -1,5 +1,5 @@ # Copyright (C) 2021-2022, Xilinx, Inc. -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -44,13 +44,14 @@ from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import InferLookupLayer +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferLookupLayer from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN export_onnx_path = "test_lookup.onnx" @@ -121,12 +122,17 @@ def test_fpgadataflow_lookup(edt, embedding_cfg, exec_mode): ret = execute_onnx(model, {iname: itensor}) exp_out = np.take(embeddings, itensor, axis=0) assert (exp_out == ret[oname]).all() - # call transformation to convert to HLS and verify conversion + # call transformation to convert to HW layer and verify conversion model = model.transform(InferLookupLayer()) assert model.graph.node[0].op_type == "Lookup" assert model.graph.node[0].input[0] == iname assert model.graph.node[0].input[1] == ename assert model.graph.node[0].output[0] == oname + ret_hw = execute_onnx(model, {iname: itensor}) + assert (exp_out == ret_hw[oname]).all() + # call transformation to convert abstraction layer into HLS layer + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "Lookup_hls" if exec_mode == "cppsim": model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareCppSim()) @@ -166,14 +172,10 @@ def test_fpgadataflow_lookup_external(): assert tuple(model.get_tensor_shape(ename)) == eshape assert tuple(model.get_tensor_shape(oname)) == exp_oshape assert (model.get_initializer(ename) == embeddings).all() - # itensor = gen_finn_dt_tensor(idt, ishape).astype(np.int64) - # itensor = np.clip(itensor, 0, num_embeddings - 1) - # ret = execute_onnx(model, {iname: itensor}) - # exp_out = np.take(embeddings, itensor, axis=0) - # assert (exp_out == ret[oname]).all() - # call transformation to convert to HLS and verify conversion model = model.transform(InferLookupLayer()) assert model.graph.node[0].op_type == "Lookup" + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "Lookup_hls" assert model.graph.node[0].input[0] == iname assert model.graph.node[0].input[1] == ename assert model.graph.node[0].output[0] == oname diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index b80ef76a19..2a22f3fc41 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -35,7 +35,12 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.util.basic import ( calculate_signed_dot_prod_range, gen_finn_dt_tensor, @@ -43,15 +48,25 @@ ) import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.derive_characteristic import DeriveCharacteristic from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None): @@ -90,7 +105,7 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non actval = 0 no_act = 1 FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MVAU", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -127,16 +142,32 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non return model -def prepare_inputs(input_tensor, idt, wdt): +def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W): + matmul_node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"]) + graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm]) + + model = qonnx_make_model(graph, producer_name="fclayer-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("ifm", idt) + model.set_tensor_datatype("weights", wdt) + model.set_tensor_datatype( + "ofm", DataType["INT32"] + ) # At this step, the MatMul layer does not optimize the bit-width of the output datatype + model.set_initializer("weights", W) + # model.set_tensor_layout("ifm", DataLayout.NHWC) + + return model + + +def prepare_inputs(input_tensor, idt, wdt, inp_name="inp"): if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: # convert bipolar to binary - return {"inp": (input_tensor + 1) / 2} + return {inp_name: (input_tensor + 1) / 2} else: - return {"inp": input_tensor} + return {inp_name: input_tensor} -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"]) # activation: None or DataType @pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]]) # weight 
datatype @@ -154,7 +185,7 @@ def prepare_inputs(input_tensor, idt, wdt): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_mvau_hwop(idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh if sf == -1: @@ -191,10 +222,98 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): else: tdt = DataType["INT32"] model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt) + # prepare input data + input_dict = prepare_inputs(x, idt, wdt) + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + # convert inputs to binary and use xnorpopcountmatmul + y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2) + else: + y = np.matmul(x, W) + if T is not None: + # y = multithreshold(y, T) + if act == DataType["BIPOLAR"]: + # binary to bipolar + # y = 2 * y - 1 + y = multithreshold(y, T, 2, -1) + else: + # signed offset + # y += act.min() + y = multithreshold(y, T, 1, act.min()) + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all(), "cppsim hw-op failed" + + +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled", "external"]) +# activation: None or DataType +@pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]]) +# weight datatype +@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["INT4"]]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]]) +# neuron folding, -1 is maximum possible +@pytest.mark.parametrize("nf", [-1, 2, 1]) +# synapse folding, -1 is maximum possible +@pytest.mark.parametrize("sf", [-1, 2, 1]) +# HLS matrix width (input features) +@pytest.mark.parametrize("mw", [16]) +# HLS matrix height (output features) +@pytest.mark.parametrize("mh", [16]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_mvau_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): + if nf == -1: + nf = mh + if sf == -1: + sf = mw + pe = mh // nf + simd = mw // sf + assert mh % pe == 0 + assert mw % sf == 0 + # generate weights + W = gen_finn_dt_tensor(wdt, (mw, mh)) + # generate input data + x = gen_finn_dt_tensor(idt, (1, mw)) + if act is None: + # no activation, produce accumulators + T = None + tdt = None + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + odt = DataType["UINT32"] + else: + odt = DataType["INT32"] + else: + odt = act + (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw) + n_steps = act.get_num_possible_values() - 1 + T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32) + # provide non-decreasing thresholds + T = np.sort(T, axis=1) + # generate thresholds for activation + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + tdt = DataType["UINT32"] + # bias thresholds to be positive + T = np.ceil((T + mw) / 2) + assert (T >= 0).all() + else: + tdt = DataType["INT32"] + model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt) + model = model.transform(GiveUniqueNodeNames()) for node in model.graph.node: # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + # Note: only HLS-based MVAU layers execute CPPsim + 
inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(GiveUniqueNodeNames()) model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -220,11 +339,11 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_produced = y_produced.reshape(y_expected.shape) - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_produced == y_expected).all(), "cppsim hls-op failed" -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"]) +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled", "external"]) # activation: None or DataType @pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]]) # weight datatype @@ -242,7 +361,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh if sf == -1: @@ -283,6 +402,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("preferred_impl_style", "hls") # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -303,6 +423,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... 
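+    # specialize the abstract MVAU node into its backend variant (MVAU_hls here,
+    # since preferred_impl_style was set to "hls" above) before IP generation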
+ model = model.transform(SpecializeLayers("xc7z020clg400-1")) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -312,9 +433,9 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_0" in hls_synt_res_est + assert "MVAU_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] + node = model.get_nodes_by_op_type("MVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) @@ -323,10 +444,10 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert exp_cycles != 0 -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["decoupled"]) +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_decoupled"]) # activation: None or DataType -@pytest.mark.parametrize("act", [DataType["INT4"]]) +@pytest.mark.parametrize("act", [None, DataType["INT4"]]) # weight datatype @pytest.mark.parametrize("wdt", [DataType["INT4"]]) # input datatype @@ -339,11 +460,15 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [128]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) +# Backend +@pytest.mark.parametrize("preferred_impl_style", ["hls", "rtl"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( - mem_mode, idt, wdt, act, nf, sf, mw, mh +def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( + mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style ): + if preferred_impl_style == "rtl" and act is not None: + pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations") if nf == -1: nf = mh if sf == -1: @@ -384,6 +509,8 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("resType", "auto") + inst.set_nodeattr("preferred_impl_style", preferred_impl_style) # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -404,6 +531,9 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... 
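+    # after specializing, MinimizeWeightBitWidth/MinimizeAccumulatorWidth shrink
+    # the weight and accumulator datatypes to the smallest widths that still
+    # cover this layer's value ranges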
+ model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -413,9 +543,13 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_0" in hls_synt_res_est + if preferred_impl_style == "hls": + assert "MVAU_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] + if preferred_impl_style == "hls": + node = model.get_nodes_by_op_type("MVAU_hls")[0] + else: + node = model.get_nodes_by_op_type("MVAU_rtl")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) @@ -424,10 +558,10 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( assert exp_cycles != 0 -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["decoupled", "const"]) +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_decoupled", "internal_embedded"]) # activation: None or DataType -@pytest.mark.parametrize("act", [DataType["INT4"]]) +@pytest.mark.parametrize("act", [None, DataType["INT4"]]) # weight datatype @pytest.mark.parametrize("wdt", [DataType["INT4"]]) # input datatype @@ -440,9 +574,15 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( @pytest.mark.parametrize("mw", [32]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [32]) +# Backend +@pytest.mark.parametrize("preferred_impl_style", ["hls", "rtl"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_mvau_fifocharacterize_rtlsim( + mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style +): + if preferred_impl_style == "rtl" and (mem_mode == "internal_embedded" or act is not None): + pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations") if nf == -1: nf = mh if sf == -1: @@ -467,8 +607,13 @@ def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("resType", "auto") + inst.set_nodeattr("preferred_impl_style", preferred_impl_style) total_fold = nf * sf exp_total_cycles = total_fold + 10 + model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -482,7 +627,101 @@ def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh chrc_out = node_inst.get_nodeattr("io_chrc_out") assert chrc_in.shape == (1, 2 * exp_total_cycles) assert chrc_out.shape == (1, 2 * exp_total_cycles) - # first sf cycles should read input continuously - assert (chrc_in[0, :sf] == range(1, sf + 1)).all() + # total number of transactions == 2*SF + assert chrc_in[0, -1] == 2 * sf # all outputs should be produced within the exp n of cycles assert chrc_out[0, exp_total_cycles] == nf + + 
+@pytest.mark.parametrize("mh", [18]) +@pytest.mark.parametrize("mw", [128]) +@pytest.mark.parametrize("pe", [1, 6, 9, 18]) +@pytest.mark.parametrize("simd", [1, 4, 16, 64, 128]) +@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) +@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) +@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) +@pytest.mark.parametrize("clk_ns", [1.66, 4]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): + if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66: + pytest.skip( + """Skip test for varying clk for devices other than Versal, + since this variable only affects DSP58s""" + ) + + # Create test input vector (produced by SWG) + ofm_shape = (3, 3) + ofm_h, ofm_w = ofm_shape + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh)) + W = gen_finn_dt_tensor(wdt, (mw, mh)) + model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + # Create MatMul & obtain golden reference output + A = gen_finn_dt_tensor( + model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in") + ) + input_dict = prepare_inputs(A, idt, wdt, inp_name="global_in") + + # Execute ONNX model + output_matmul = oxe.execute_onnx(model, input_dict)["global_out"] + + # Create MVAU (HLS) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(GiveUniqueNodeNames()) + + # Apply convert-to-rtl step + model = model.transform(SpecializeLayers(part)) + model = model.transform(GiveUniqueNodeNames()) + + # Apply folding (i.e. specify to use DSPs) + folding_config = { + "Defaults": {}, + "MVAU_rtl_0": { + "PE": pe, + "SIMD": simd, + "resType": "dsp", + }, + } + model = model.transform(ApplyConfig(folding_config)) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + # make sure the changed datatypes are propagated through the network + model = model.transform(InferDataTypes()) + + # Run CPPsim + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + output_mvau_hls = oxe.execute_onnx(model, input_dict)["global_out"] + assert ( + output_matmul == output_mvau_hls + ).all(), "Output of ONNX model not matching output of node-by-node CPPsim!" + + # Run node-by-node RTLsim + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"] + assert ( + output_matmul == output_mvau_rtl + ).all(), "Output of ONNX model not matching output of node-by-node RTLsim!" 
+
+    # Run stitched-ip RTLsim
+    model = model.transform(InsertAndSetFIFODepths(part, clk_ns))
+    model = model.transform(PrepareIP(part, clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(part, clk_ns))
+
+    model.set_metadata_prop("rtlsim_so", "")
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"]
+
+    assert (
+        output_matmul == output_mvau_rtl_stitch
+    ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
index 2ff7dd8b32..1bc2d9d59e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
+++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py
@@ -38,6 +38,7 @@
     res_estimation,
     res_estimation_complete,
 )
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers


 def check_two_dict_for_equality(dict1, dict2):
@@ -68,7 +69,7 @@ def test_res_estimate():
     node_inp_list = ["inp", "weights", "thresh"]

     FCLayer_node = helper.make_node(
-        "MatrixVectorActivation",
+        "MVAU",
         node_inp_list,
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
@@ -95,10 +96,11 @@ def test_res_estimate():
     model.set_tensor_datatype("outp", odt)
     model.set_tensor_datatype("weights", wdt)

+    model = model.transform(SpecializeLayers())
     model = model.transform(GiveUniqueNodeNames())
     prod_resource_estimation = model.analysis(res_estimation)
     expect_resource_estimation = {
-        "MatrixVectorActivation_0": {
+        "MVAU_hls_0": {
             "BRAM_18K": 0,
             "BRAM_efficiency": 1,
             "LUT": 317,
@@ -115,7 +117,7 @@ def test_res_estimate():

     prod_resource_estimation = model.analysis(res_estimation_complete)
     expect_resource_estimation = {
-        "MatrixVectorActivation_0": [
+        "MVAU_hls_0": [
             {
                 "BRAM_18K": 0,
                 "BRAM_efficiency": 1,
diff --git a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
index 67a40d96f3..0df7181a60 100644
--- a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
+++ b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2020-2022, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
 # All rights reserved.
# # Redistribution and use in source and binary forms, with or without @@ -40,12 +41,13 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import InferStreamingMaxPool +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferStreamingMaxPool from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode): @@ -92,7 +94,7 @@ def prepare_inputs(input_tensor): # input dimension @pytest.mark.parametrize("ifm_dim", [4, 10]) # input channels -@pytest.mark.parametrize("ifm_ch", [1, 3]) # 1,3 +@pytest.mark.parametrize("ifm_ch", [1, 3]) # pe @pytest.mark.parametrize("pe", [1, 3]) # ceil mode @@ -138,10 +140,16 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil model = golden.transform(InferStreamingMaxPool()) model = model.transform(InferShapes()) - assert model.graph.node[0].op_type == "StreamingMaxPool_Batch" + assert model.graph.node[0].op_type == "StreamingMaxPool" + + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all() + + model = model.transform(SpecializeLayers()) # Ensure PE value is set - streamingmaxpool_node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0] + streamingmaxpool_node = model.get_nodes_by_op_type("StreamingMaxPool_hls")[0] getCustomOp(streamingmaxpool_node).set_nodeattr("PE", pe) if exec_mode == "cppsim": @@ -162,7 +170,7 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0] + node = model.get_nodes_by_op_type("StreamingMaxPool_hls")[0] # inst = getCustomOp(node) # cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 2b7bc28a10..a6e7e41596 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
 #
 # Redistribution and use in source and binary forms, with or without
@@ -51,12 +51,37 @@
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers

 test_fpga_part = "xczu3eg-sbva484-1-e"
 target_clk_ns = 5


-def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs):
+def generate_random_threshold_values(input_data_type, num_input_channels, num_steps):
+    return np.random.randint(
+        input_data_type.min(),
+        input_data_type.max() + 1,
+        (num_input_channels, num_steps),
+    ).astype(np.float32)
+
+
+def sort_thresholds_increasing(thresholds):
+    return np.sort(thresholds, axis=1)
+
+
+# n = batch, c = channel, h = height, w = width of feature map
+# Standard = NCHW; FINN = NHWC
+# Convert from NHWC(FINN) to NCHW(Standard)
+def layout_FINN2NCHW(data):
+    return np.transpose(data, (0, 3, 1, 2))
+
+
+# Convert from NCHW(Standard) to NHWC(FINN)
+def layout_NCHW2FINN(data):
+    return np.transpose(data, (0, 2, 3, 1))
+
+
+def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs):
     NumChannels = T.shape[0]

     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels])
@@ -65,20 +90,19 @@
     node_inp_list = ["inp", "thresh"]

     Thresholding_node = helper.make_node(
-        "Thresholding_Batch",
+        "Thresholding",
         node_inp_list,
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         NumChannels=NumChannels,
-        PE=pe,
         numSteps=T.shape[1],
         inputDataType=idt.name,
         weightDataType=idt.name,  # will be set by MinimizeAccumulatorWidth
         outputDataType=odt.name,
         ActVal=actval,
-        mem_mode=mem_mode,
         numInputVectors=n_inp_vecs,
+        preferred_impl_style=impl_style,
     )
     graph = helper.make_graph(
         nodes=[Thresholding_node],
@@ -109,32 +133,82 @@
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 # memory mode
-@pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
+@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"])
+@pytest.mark.parametrize("impl_style", ["rtl", "hls"])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
-def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
+def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem_mode):
+    # mem_mode only applies to the HLS thresholding implementation, so for
+    # impl_style=rtl the test runs once and is skipped for the second mem_mode
+    # value; otherwise the same rtl configuration would run twice.
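+    # e.g. rtl+internal_decoupled is skipped below, leaving rtl+internal_embedded
+    # as the single rtl run, while hls runs with both mem_mode values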
+ if impl_style == "rtl" and mem_mode == "internal_decoupled": + pytest.skip( + "Skip, because test is identical to impl_style=rtl and mem_mode=internal_embedded" + ) if nf == -1: nf = ich pe = ich // nf n_inp_vecs = [1, 2, 2] assert ich % pe == 0 - # generate input data + # generate input data, data layout is NHWC for FINN x = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ich])) odt = act n_steps = act.get_num_possible_values() - 1 - T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32) - # provide non-decreasing thresholds - T = np.sort(T, axis=1) + + # Generate random, non-decreasing thresholds + thresholds = generate_random_threshold_values(idt, ich, n_steps) + + thresholds = sort_thresholds_increasing(thresholds) if odt == DataType["BIPOLAR"]: actval = 0 else: actval = odt.min() - model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs) + # Build DUT + model = make_single_thresholding_modelwrapper( + impl_style, thresholds, idt, odt, actval, n_inp_vecs + ) + + # Expected Reference output + # multithreshold util fxn wants NCHW input, not NHWC + x_nchw = layout_FINN2NCHW(x) + y = multithreshold(x_nchw, thresholds) + + # convert back to NHWC for comparison to hw outputs + y = layout_NCHW2FINN(y) + if act == DataType["BIPOLAR"]: + # binary to bipolar + y = 2 * y - 1 + else: + # signed offset + y += act.min() + + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + + # package input data as dictionary + input_dict = {"inp": x} + + # execute DUT + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all() + + model = model.transform(SpecializeLayers()) + # Make sure that SpecializeLayers did not default to HLS implementation unexpectedly + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) + node = model.graph.node[0] + inst = getCustomOp(node) + inst.set_nodeattr("PE", pe) + if impl_style == "hls": + inst.set_nodeattr("mem_mode", mem_mode) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -149,60 +223,49 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): else: raise Exception("Unknown exec_mode") - # package input data as dictionary - input_dict = {"inp": x} - - # multithreshold util fxn wants NCHW input, not NHWC - y = multithreshold(np.transpose(x, (0, 3, 1, 2)), T) - # convert back to NHWC for comparison to hw outputs - y = np.transpose(y, (0, 2, 3, 1)) - if act == DataType["BIPOLAR"]: - # binary to bipolar - y = 2 * y - 1 - else: - # signed offset - y += act.min() - - oshape = model.get_tensor_shape("outp") - y_expected = y.reshape(oshape) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] y_produced = y_produced.reshape(y_expected.shape) - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "Thresholding_Batch_0" in hls_synt_res_est - - node = model.get_nodes_by_op_type("Thresholding_Batch")[0] + if impl_style == "hls": + hls_synt_res_est = model.analysis(hls_synth_res_estimation) + assert model.graph.node[0].name in hls_synt_res_est + node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) exp_cycles = exp_cycles_dict[node.name] - assert 
np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) assert exp_cycles != 0 +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +# configuration (ch, pe) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 4)]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_single_layer(): +def test_runtime_thresholds_read(impl_style, cfg): + """Read back threshold weights during runtime + + 1. Create random initial weights T + 2. Execute model + 3. Read back weights via AXI + 4. Compare with initial weights T + """ + ch = cfg[0] + pe = cfg[1] n_inp_vecs = [1, 2, 2] - mem_mode = "decoupled" + hls_mem_mode = "internal_decoupled" act = DataType["INT4"] idt = DataType["INT16"] - nf = 8 - ich = 16 - pe = ich // nf - assert ich % pe == 0 - - # generate input data - in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ich])) - odt = act n_steps = act.get_num_possible_values() - 1 - T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32) + np.random.seed(2) + T = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) # provide non-decreasing thresholds T = np.sort(T, axis=1) @@ -211,17 +274,29 @@ def test_runtime_thresholds_single_layer(): else: actval = odt.min() - model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs) - op_inst = getCustomOp(model.graph.node[0]) + model = make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs) + model = model.transform(SpecializeLayers()) + + # Make sure that specialize layer did not default to HLS implementation + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) + + node = model.get_nodes_by_op_type(f"Thresholding_{impl_style}")[0] + op_inst = getCustomOp(node) + op_inst.set_nodeattr("PE", pe) + if impl_style == "hls": + op_inst.set_nodeattr("mem_mode", hls_mem_mode) op_inst.set_nodeattr("runtime_writeable_weights", 1) - op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat") - with open("old_weights.dat", "r") as f: + + dat_fname = f"old_weights_{cfg}.dat" + op_inst.make_weight_file(T, "decoupled_runtime", dat_fname) + with open(dat_fname, "r") as f: old_weight_stream = f.read().strip() - os.remove("old_weights.dat") + os.remove(dat_fname) old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) old_weight_stream = list(old_weight_stream) # need to create stitched IP for runtime weight testing model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) @@ -231,7 +306,10 @@ def test_runtime_thresholds_single_layer(): # add two copies of the input tensor as the first one is just used to # "flush out" the pipeline (as mvau already starts receiving old weights while # we read/write new ones and reads seem to cause a disturbance too) + # generate input data + in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ch])) in_tensor = np.tile(in_tensor, (2, 1, 1, 1)) + exec_ctx = {"inp": in_tensor} extracted_weight_stream = [] @@ -242,51 +320,140 @@ def read_weights(sim): addr += 4 rtlsim_exec(model, exec_ctx, pre_hook=read_weights) + + # Validate the AXI Read weights assert extracted_weight_stream == old_weight_stream - # only use second batch element in output; first will be invalid due to - # old weights (see above) - y = exec_ctx["outp"][1] + + 
y = exec_ctx["outp"][0]
     # multithreshold util fxn wants NCHW input, not NHWC
     expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), T)
     # convert back to NHWC for comparison to hw outputs
     expected = np.transpose(expected, (0, 2, 3, 1))[1]
-    # expected = multithreshold(in_tensor, T)[1]

     if act == DataType["BIPOLAR"]:
         # binary to bipolar
         expected = 2 * expected - 1
     else:
         # signed offset
         expected += act.min()
+
+    # Validate the output is as expected
     assert (y == expected).all()

-    new_weights = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
+
+@pytest.mark.parametrize("impl_style", ["hls", "rtl"])
+# configuration (ch, pe)
+@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 4)])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+def test_runtime_thresholds_write(impl_style, cfg):
+    """Write threshold weights during runtime
+
+    1. Create random initial weights T_init
+    2. Create model with initial weights
+    3. Create new set of weights T_write
+    4. Write T_write using AXI bus
+    5. Read back using AXI bus to T_read
+    6. Compare T_write and T_read
+    7. Validate outputs with expected vectors
+    """
+    ch = cfg[0]
+    pe = cfg[1]
+
+    n_inp_vecs = [1, 2, 2]
+    hls_mem_mode = "internal_decoupled"
+    act = DataType["INT4"]
+    idt = DataType["INT16"]
+
+    odt = act
+    n_steps = act.get_num_possible_values() - 1
+    np.random.seed(2)
+    T_init = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32)
+    # provide non-decreasing thresholds
+    T_init = np.sort(T_init, axis=1)
+
+    if odt == DataType["BIPOLAR"]:
+        actval = 0
+    else:
+        actval = odt.min()
+
+    model = make_single_thresholding_modelwrapper(impl_style, T_init, idt, odt, actval, n_inp_vecs)
+    model = model.transform(SpecializeLayers())
+
+    # Validate that specialize layer did not default to HLS implementation
+    assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style)
+
+    op_inst = getCustomOp(model.graph.node[0])
+    op_inst.set_nodeattr("PE", pe)
+    if impl_style == "hls":
+        op_inst.set_nodeattr("mem_mode", hls_mem_mode)
+    op_inst.set_nodeattr("runtime_writeable_weights", 1)
+
+    # Make new weights for runtime write
+    np.random.seed(4)
+    T_write = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32)
     # provide non-decreasing thresholds
-    new_weights = np.sort(T, axis=1)
-    op_inst.make_weight_file(new_weights, "decoupled_runtime", "new_weights.dat")
-    with open("new_weights.dat", "r") as f:
-        new_weight_stream = f.read().strip()
-    os.remove("new_weights.dat")
-    new_weight_stream = map(lambda x: int(x, 16), new_weight_stream.split("\n"))
-    new_weight_stream = list(new_weight_stream)
+    T_write = np.sort(T_write, axis=1)
+
+    dat_fname = f"T_write_{cfg}.dat"  # distinguish fname per parameter for distributed testing
+    op_inst.make_weight_file(T_write, "decoupled_runtime", dat_fname)
+    with open(dat_fname, "r") as f:
+        T_write_stream = f.read().strip()
+    os.remove(dat_fname)
+
+    T_write_stream = map(lambda x: int(x, 16), T_write_stream.split("\n"))
+    T_write_stream = list(T_write_stream)
+
+    # need to create stitched IP for runtime weight testing
+    model = model.transform(InsertFIFO(True))
+    model = model.transform(SpecializeLayers())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    model = model.transform(PrepareRTLSim())
+    
model.set_metadata_prop("exec_mode", "rtlsim")
+    # add two copies of the input tensor as the first one is just used to
+    # "flush out" the pipeline (as mvau already starts receiving old weights while
+    # we read/write new ones and reads seem to cause a disturbance too)
+    # generate input data
+    in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ch]))
+    in_tensor = np.tile(in_tensor, (2, 1, 1, 1))
+
+    exec_ctx_write = {"inp": in_tensor}

     def write_weights(sim):
         addr = 0
-        for nw in new_weight_stream:
+        for nw in T_write_stream:
             axilite_write(sim, addr, nw, basename="s_axilite_0_")
             addr += 4

-    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
-    y = exec_ctx["outp"][1]
+    T_read_stream = []
+
+    def read_weights(sim):
+        addr = 0
+        for i in range(len(T_write_stream)):
+            T_read_stream.append(axilite_read(sim, addr, basename="s_axilite_0_"))
+            addr += 4
+
+    rtlsim_exec(model, exec_ctx_write, pre_hook=write_weights, post_hook=read_weights)
+
+    y = exec_ctx_write["outp"][1]
+
+    assert T_read_stream == T_write_stream
+
     # multithreshold util fxn wants NCHW input, not NHWC
-    expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), new_weights)
+    expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), T_write)
     # convert back to NHWC for comparison to hw outputs
     expected = np.transpose(expected, (0, 2, 3, 1))[1]
+
     if act == DataType["BIPOLAR"]:
         # binary to bipolar
         expected = 2 * expected - 1
     else:
         # signed offset
         expected += act.min()
+
+    # Validate the output is as expected
     assert (y == expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
index 70d81c7d31..b0da767eaa 100644
--- a/tests/fpgadataflow/test_fpgadataflow_upsampler.py
+++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
@@ -48,12 +48,13 @@
 import finn.core.onnx_exec as oxe
 import finn.transformation.streamline.absorb as absorb
 from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.convert_to_hls_layers import InferUpsample
+from finn.transformation.fpgadataflow.convert_to_hw_layers import InferUpsample
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
 from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN
 from finn.util.basic import make_build_dir

@@ -84,29 +85,6 @@ def apply(self, model):
 _to_chan_first_args = (0, 3, 1, 2)


-class TransposeUpsampleIO(Transformation):
-    """
-    Converts the inputs outputs for all Upsample and Resize nodes
-    from NCHW to NHWC.
- """ - - def apply(self, model): - graph = model.graph - for n in graph.node: - if n.op_type == "Upsample" or n.op_type == "Resize": - # Set input shape - inp = n.input[0] - NCHW_shape = model.get_tensor_shape(inp) - NHWC_shape = [NCHW_shape[idx] for idx in _to_chan_last_args] - model.set_tensor_shape(inp, NHWC_shape) - # Set output shape - out = n.output[0] - NCHW_shape = model.get_tensor_shape(out) - NHWC_shape = [NCHW_shape[idx] for idx in _to_chan_last_args] - model.set_tensor_shape(out, NHWC_shape) - return model, False - - class PyTorchTestModel(nn.Module): def __init__(self, upscale_factor=2): super(PyTorchTestModel, self).__init__() @@ -173,7 +151,6 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d # Prep model for execution model = ModelWrapper(export_path) - # model = model.transform(TransposeUpsampleIO()) model = model.transform(MakeInputChannelsLast()) model = model.transform(InferDataLayouts()) model = model.transform(absorb.AbsorbTransposeIntoResize()) @@ -186,8 +163,18 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d # Check that all nodes are UpsampleNearestNeighbour_Batch nodes for n in model.get_finn_nodes(): - node_check = n.op_type == "UpsampleNearestNeighbour_Batch" - assert node_check, "All nodes should be UpsampleNearestNeighbour_Batch nodes." + node_check = n.op_type == "UpsampleNearestNeighbour" + assert node_check, "All nodes should be UpsampleNearestNeighbour nodes." + + test_in_transposed = test_in.numpy().transpose(_to_chan_last_args) + input_dict = {model.graph.input[0].name: test_in_transposed} + + # Run sim + output_dict = oxe.execute_onnx(model, input_dict, True) + test_result = output_dict[model.graph.output[0].name] + output_matches = np.isclose(golden_result, test_result, atol=atol).all() + + model = model.transform(SpecializeLayers()) # Prep sim if exec_mode == "cppsim": @@ -204,8 +191,6 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d raise Exception("Unknown exec_mode") # Run sim - test_in_transposed = test_in.numpy().transpose(_to_chan_last_args) - input_dict = {model.graph.input[0].name: test_in_transposed} output_dict = oxe.execute_onnx(model, input_dict, True) test_result = output_dict[model.graph.output[0].name] output_matches = np.isclose(golden_result, test_result, atol=atol).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index 4208169c0b..236176faa6 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -32,21 +32,40 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels): @@ -90,7 +109,7 @@ def _make_single_vvau_modelwrapper( odt, T=None, tdt=None, - mem_mode="const", + mem_mode="internal_embedded", ): in_shape = [1, dim_h, dim_w, k_h * k_w * channels] # [N, H, W, K*K*CH] out_shape = [ @@ -116,7 +135,7 @@ def _make_single_vvau_modelwrapper( actval = 0 VVAU_node = helper.make_node( - "VectorVectorActivation", + "VVAU", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -157,10 +176,6 @@ def _make_single_vvau_modelwrapper( return model -def prepare_inputs(input_tensor): - return {"inp": input_tensor} - - # input datatype @pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["UINT4"]]) # weight datatype @@ -180,7 +195,7 @@ def prepare_inputs(input_tensor): # Number of input and output channels @pytest.mark.parametrize("channels", [3, 6]) # memory mode -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @@ -232,6 +247,12 @@ def test_fpgadataflow_vvau( model = _make_single_vvau_modelwrapper( W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode ) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + input_dict = prepare_inputs(x_vvau) + y_hwop = oxe.execute_onnx(model, input_dict)["global_out"] + model = 
model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) @@ -246,8 +267,6 @@ def test_fpgadataflow_vvau( else: raise Exception("Unknown exec_mode in test_fpgadataflow_vvau") - input_dict = prepare_inputs(x_vvau) - # Calculate output if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: # Simulate XNOR-popcount matrix multiplication, see @@ -269,15 +288,183 @@ def test_fpgadataflow_vvau( # signed offset y_expected += act.min() - y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)["outp"] + y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)["global_out"] - assert (y_produced == y_expected).all(), "incorrect result" + assert (y_hwop == y_expected).all(), "VVAU HW-op mismatches with golden output!" + assert (y_produced == y_expected).all(), "VVAU specialized-op mismatches with golden output!" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("VectorVectorActivation")[0] + node = model.get_nodes_by_op_type("VVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) exp_cycles = exp_cycles_dict[node.name] assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) assert exp_cycles != 0 + + +def make_single_dw_conv_modelwrapper(conv_config, idt, wdt): + kernel_size, in_feature_dim, in_chn = conv_config + stride = 1 + pad = 0 + + out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad) + group = out_chn = in_chn + + conv_param_shape = [out_chn, 1, kernel_size, kernel_size] + input_shape = [1, in_chn, in_feature_dim, in_feature_dim] + output_shape = [1, out_chn, out_feature_dim, out_feature_dim] + + conv_config = {} + conv_config["dilations"] = [1, 1] + conv_config["group"] = group + conv_config["kernel_shape"] = [kernel_size, kernel_size] + conv_config["pads"] = [pad, pad, pad, pad] + conv_config["strides"] = [stride, stride] + + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, input_shape) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, output_shape) + weights = [helper.make_tensor_value_info("weights", TensorProto.FLOAT, conv_param_shape)] + + modelproto = qonnx_make_model( + helper.make_graph( + name="conv_test", + inputs=[ifm], + outputs=[ofm], + value_info=weights, + nodes=[helper.make_node("Conv", ["ifm", "weights"], ["ofm"], **conv_config)], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("ifm", idt) + model.set_tensor_datatype("weights", wdt) + model.set_initializer("weights", gen_finn_dt_tensor(wdt, conv_param_shape)) + + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return model + + +def prepare_inputs(input_tensor): + return {"global_in": input_tensor} + + +# kernel size (square) +@pytest.mark.parametrize("kernel_size", [3]) +# IFM size (square) +@pytest.mark.parametrize("in_feature_dim", [5]) +# input channels +@pytest.mark.parametrize("in_chn", [4]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT8"]]) +# weight datatype +@pytest.mark.parametrize("wdt", [DataType["INT6"]]) +# targeted board +@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) +# pe +@pytest.mark.parametrize("pe", [1, 2, 4]) +# simd +@pytest.mark.parametrize("simd", [1, 3, 9]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, part, pe, simd): + # 
Create depthwise convolution
+    conv_config = (kernel_size, in_feature_dim, in_chn)
+    model = make_single_dw_conv_modelwrapper(conv_config, idt, wdt)
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    # Obtain golden reference output
+    golden_in = gen_finn_dt_tensor(
+        model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")
+    )
+    input_dict = prepare_inputs(golden_in)
+    golden_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)["global_out"]
+
+    # Convert to HW layer ops first
+    model = model.transform(LowerConvsToMatMul())
+    model = model.transform(to_hw.InferConvInpGen())
+    model = model.transform(to_hw.InferVectorVectorActivation())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+
+    output_vvau_hw = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)[
+        "global_out"
+    ]
+    assert (
+        golden_out == output_vvau_hw
+    ).all(), "Output of ONNX model not matching output of HW-ops!"
+
+    # Obtain second reference from the specialized (RTL) VVAU layer
+    model = model.transform(SpecializeLayers(part))
+    model = model.transform(GiveUniqueNodeNames())
+
+    # Apply folding (i.e. specify to use DSPs)
+    folding_config = {
+        "Defaults": {},
+        "ConvolutionInputGenerator_rtl_0": {
+            "SIMD": pe,
+            "parallel_window": 1,
+        },
+        "VVAU_rtl_0": {
+            "PE": pe,
+            "SIMD": simd,
+            "resType": "dsp",
+        },
+    }
+    model = model.transform(ApplyConfig(folding_config))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(MinimizeWeightBitWidth())
+    model = model.transform(MinimizeAccumulatorWidth())
+    # make sure the changed datatypes are propagated through the network
+    model = model.transform(InferDataTypes())
+
+    # Run CPPsim
+    model = model.transform(SetExecMode("cppsim"))
+    model = model.transform(PrepareCppSim())
+    model = model.transform(CompileCppSim())
+    output_vvau_cppsim = oxe.execute_onnx(model, input_dict)["global_out"]
+    assert (
+        golden_out == output_vvau_cppsim
+    ).all(), "Output of ONNX model not matching output of node-by-node CPPsim!"
+
+    # Run node-by-node RTLsim
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(part, 5))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+    output_vvau_rtlsim = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)[
+        "global_out"
+    ]
+
+    assert (
+        golden_out == output_vvau_rtlsim
+    ).all(), "Output of ONNX model not matching output of specialized HW-ops!"
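+    # unlike the single-node MVAU test, this graph holds two HW ops (the SWG and
+    # the VVAU), so stitched-IP RTLsim below is run on a dataflow partition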
+
+    # Stitched-IP RTLsim
+    model = model.transform(CreateDataflowPartition())
+    partition_model_path = getCustomOp(
+        model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    ).get_nodeattr("model")
+    partitioned_model = ModelWrapper(partition_model_path)
+    # FIFOs needed for stitched-ip RTLsim, DWC needed for VVU operating on SIMD parallelism
+    partitioned_model = partitioned_model.transform(InsertAndSetFIFODepths(part, 5))
+    partitioned_model = partitioned_model.transform(PrepareIP(part, 5))
+    partitioned_model = partitioned_model.transform(HLSSynthIP())
+    partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5))
+    # set top-level prop for stitched-ip rtlsim and launch
+    partitioned_model.set_metadata_prop("exec_mode", "rtlsim")
+    # transpose input since we're now simulating HW layers (NCHW --> NHWC)
+    input_dict["global_in"] = np.transpose(input_dict["global_in"], (0, 2, 3, 1))
+    output_vvau_stitched = oxe.execute_onnx(
+        partitioned_model, input_dict, return_full_exec_context=True
+    )["global_out"]
+    # transpose hardware-generated outputs NHWC -> NCHW to be comparable
+    output_vvau_stitched = output_vvau_stitched.transpose(0, 3, 1, 2)
+
+    assert (
+        golden_out == output_vvau_stitched
+    ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
diff --git a/tests/fpgadataflow/test_minimize_bit_width.py b/tests/fpgadataflow/test_minimize_bit_width.py
index 0e704230e7..4b26e7ac00 100644
--- a/tests/fpgadataflow/test_minimize_bit_width.py
+++ b/tests/fpgadataflow/test_minimize_bit_width.py
@@ -36,8 +36,8 @@
 from qonnx.util.basic import gen_finn_dt_tensor, roundup_to_integer_multiple
 from typing import Optional, Union

-from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation
-from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation
+from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
+from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU
 from finn.transformation.fpgadataflow.minimize_accumulator_width import (
     MinimizeAccumulatorWidth,
 )
@@ -52,7 +52,7 @@ def make_unit_test_model(wdt: DataType, idt: DataType, tdt: Optional[DataType] =
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 32, 32, 288])
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 32, 32, 64])
     layer1 = helper.make_node(
-        "VectorVectorActivation",
+        "VVAU",
         ["inp", "params0", "thresh0"] if tdt is not None else ["inp", "params0"],
         ["hid"],
         domain="finn.custom_op.fpgadataflow",
@@ -68,7 +68,7 @@ def make_unit_test_model(wdt: DataType, idt: DataType, tdt: Optional[DataType] =
         noActivation=0 if tdt is not None else 1,
     )
     layer2 = helper.make_node(
-        "MatrixVectorActivation",
+        "MVAU",
         ["hid", "params1", "thresh1"] if tdt is not None else ["hid", "params1"],
         ["outp"],
         domain="finn.custom_op.fpgadataflow",
@@ -170,7 +170,7 @@ def test_minimize_weight_bit_width(wdt: DataType, rww: bool):
     # If runtime-writeable weights, specify as a node attribute
     for node in model.graph.node:
         inst = getCustomOp(node)
-        if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)):
+        if isinstance(inst, (MVAU, VVAU)):
             inst.set_nodeattr("runtime_writeable_weights", int(rww))

     # Apply the optimization
@@ -179,14 +179,14 @@ def test_minimize_weight_bit_width(wdt: DataType, rww: bool):
     # Iterate through each node to make sure it functioned properly
     for node in model.graph.node:
         inst = getCustomOp(node)
-        if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)):
+        if 
isinstance(inst, (MVAU, VVAU)): cur_wdt = DataType[inst.get_nodeattr("weightDataType")] exp_wdt = def_wdt if rww else wdt assert cur_wdt.bitwidth() == exp_wdt.bitwidth(), "Mismatched data types" def calculate_accumulator_bit_width( - inst: Union[MatrixVectorActivation, VectorVectorActivation], model: ModelWrapper + inst: Union[MVAU, VVAU], model: ModelWrapper ) -> Union[DataType, IntType]: """Calculate the accumulator bit width using the closed-form expressions derived in `Quantized Neural Networks for Low-Precision Accumulation @@ -206,9 +206,9 @@ def phi(x: float) -> float: if inst.get_nodeattr("binaryXnorMode"): weights = 2 * weights - 1 # modify the weights based on if the node is a VVAU or MVAU - if isinstance(inst, MatrixVectorActivation): + if isinstance(inst, MVAU): K = inst.get_nodeattr("MW") # matrix_width = num_inputs - elif isinstance(inst, VectorVectorActivation): + elif isinstance(inst, VVAU): k_h, k_w = inst.get_nodeattr("Kernel") K = k_h * k_w # size of kernels = num_inputs fm = inst.get_nodeattr("Channels") @@ -275,7 +275,7 @@ def test_minimize_accumulator_width(wdt: DataType, idt: DataType, tdt: DataType, # If runtime-writeable weights, specify as a node attribute for node in model.graph.node: inst = getCustomOp(node) - if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)): + if isinstance(inst, (MVAU, VVAU)): inst.set_nodeattr("runtime_writeable_weights", int(rww)) cur_adt = DataType[inst.get_nodeattr("accDataType")] assert cur_adt.bitwidth() == def_adt.bitwidth(), "Default data type is incorrect" @@ -286,7 +286,7 @@ def test_minimize_accumulator_width(wdt: DataType, idt: DataType, tdt: DataType, # Iterate through each node to make sure it functioned properly for node in model.graph.node: inst = getCustomOp(node) - if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)): + if isinstance(inst, (MVAU, VVAU)): cur_adt = DataType[inst.get_nodeattr("accDataType")] cur_odt = DataType[inst.get_nodeattr("outputDataType")] # Calculating expected accumulator bit width using a closed-form expression diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py index 9b2f418776..3e7822a077 100644 --- a/tests/fpgadataflow/test_runtime_weights.py +++ b/tests/fpgadataflow/test_runtime_weights.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -41,6 +42,7 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.create import hls_random_mlp_maker test_fpga_part = "xczu3eg-sbva484-1-e" @@ -68,9 +70,10 @@ def test_runtime_weights_single_layer(): } layer_spec_list = [layer_spec] model = hls_random_mlp_maker(layer_spec_list) - fcl = model.get_nodes_by_op_type("MatrixVectorActivation")[0] + model = model.transform(SpecializeLayers()) + fcl = model.get_nodes_by_op_type("MVAU_hls")[0] op_inst = getCustomOp(fcl) - op_inst.set_nodeattr("mem_mode", "decoupled") + op_inst.set_nodeattr("mem_mode", "internal_decoupled") op_inst.set_nodeattr("runtime_writeable_weights", 1) old_weights = model.get_initializer(fcl.input[1]) op_inst.make_weight_file(old_weights, "decoupled_runtime", "old_weights.dat") @@ -80,6 +83,7 @@ def test_runtime_weights_single_layer(): old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) old_weight_stream = list(old_weight_stream) model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py index ce9f4b12ed..19e459c222 100644 --- a/tests/fpgadataflow/test_set_folding.py +++ b/tests/fpgadataflow/test_set_folding.py @@ -64,10 +64,10 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): simd = 1 FCLayer_nodes += [ helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", [tensors[i].name, "weights_" + str(i), "thresh_" + str(i)], [tensors[i + 1].name], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=ch, MH=ch, diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py index 653e1e7896..d192755d06 100644 --- a/tests/fpgadataflow/test_split_large_fifos.py +++ b/tests/fpgadataflow/test_split_large_fifos.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -55,7 +55,7 @@ def get_folding_cfg(depth=65536): cfg = dict() cfg["Defaults"] = dict() for i in range(4): - key = "StreamingFIFO_" + str(i) + key = "StreamingFIFO_rtl_" + str(i) cfg[key] = {"depth": depth, "ram_style": "auto", "impl_style": "vivado"} return cfg @@ -86,7 +86,6 @@ def test_split_large_fifos(depth, force_python_rtlsim): build_cfg.DataflowOutputType.STITCHED_IP, build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, ], - default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED, ) build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg) with open(tmp_output_dir + "/report/estimate_network_performance.json") as f: @@ -98,7 +97,7 @@ def test_split_large_fifos(depth, force_python_rtlsim): ) model = ModelWrapper(tmp_output_dir + "/intermediate_models/step_set_fifo_depths.onnx") # exclude final FIFO node (output FIFO, not part of test) - fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")[:-1] + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO_rtl")[:-1] golden_cfg = get_fifo_split_configs(depth, 256, 32768) for i, fifo_node in enumerate(fifo_nodes): inst = getCustomOp(fifo_node) diff --git a/tests/transformation/test_infer_data_layouts_cnv.py b/tests/transformation/test_infer_data_layouts_cnv.py index 25bf890271..fc9d98d24f 100644 --- a/tests/transformation/test_infer_data_layouts_cnv.py +++ b/tests/transformation/test_infer_data_layouts_cnv.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,7 +46,7 @@ from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.util.cleanup import cleanup as qonnx_cleanup -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline @@ -56,6 +57,7 @@ @pytest.mark.transform +@pytest.mark.xfail def test_infer_data_layouts_cnv(): cnv = get_test_model_trained("CNV", 1, 1) export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path_cnv) @@ -100,10 +102,10 @@ def test_infer_data_layouts_cnv(): model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) model = model.transform(ConvertBipolarMatMulToXnorPopcount()) model = model.transform(Streamline()) - model = model.transform(to_hls.InferBinaryMatrixVectorActivation()) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferStreamingMaxPool()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataLayouts()) @@ -114,9 +116,9 @@ def test_infer_data_layouts_cnv(): # since the concept of channels changes with lowering... 
but it is # conceptually close to NHWC since the innermost dim gets multiplied assert model.get_tensor_layout("ConvolutionInputGenerator_0_out0") == DataLayout.NHWC - assert model.get_tensor_layout("MatrixVectorActivation_3_out0") == DataLayout.NHWC + assert model.get_tensor_layout("MVAU_3_out0") == DataLayout.NHWC assert model.get_tensor_layout("Reshape_0_out0") == DataLayout.NC - assert model.get_tensor_layout("MatrixVectorActivation_6_out0") == DataLayout.NC + assert model.get_tensor_layout("MVAU_6_out0") == DataLayout.NC assert model.get_tensor_layout("global_out") == DataLayout.NC os.remove(export_onnx_path_cnv) diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index 3649d6709e..c8f80a8e1b 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -50,6 +50,7 @@ def test_end2end_build_dataflow_directory(): assert os.path.isfile(output_dir + "/time_per_step.json") assert os.path.isfile(output_dir + "/auto_folding_config.json") assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/stitched_ip/ip/component.xml") assert os.path.isfile(output_dir + "/driver/driver.py") assert os.path.isfile(output_dir + "/report/estimate_layer_cycles.json")