From 27d4ee23bff9122ca8d85fd3d0dbd401b98bef4c Mon Sep 17 00:00:00 2001 From: Abhishek Tiwari Date: Fri, 1 Apr 2022 02:56:01 -0700 Subject: [PATCH 1/7] add ap_float tutorial --- .../Features/ac_types/ap_float/CMakeLists.txt | 20 + .../Features/ac_types/ap_float/License.txt | 23 + .../Features/ac_types/ap_float/README.md | 475 ++++++++++++++++ .../Features/ac_types/ap_float/ap_float.sln | 25 + .../ac_types/ap_float/ap_float.vcxproj | 164 ++++++ .../Features/ac_types/ap_float/sample.json | 61 ++ .../ac_types/ap_float/src/CMakeLists.txt | 80 +++ .../ac_types/ap_float/src/ap_float.cpp | 528 ++++++++++++++++++ .../Features/ac_types/ap_float/src/util.hpp | 16 + 9 files changed, 1392 insertions(+) create mode 100755 DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/CMakeLists.txt create mode 100755 DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/License.txt create mode 100755 DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/README.md create mode 100755 DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.sln create mode 100755 DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.vcxproj create mode 100755 DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/sample.json create mode 100755 DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/CMakeLists.txt create mode 100644 DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/ap_float.cpp create mode 100644 DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/util.hpp diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/CMakeLists.txt new file mode 100755 index 0000000000..38cebd3197 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/CMakeLists.txt @@ -0,0 +1,20 @@ +if(UNIX) + # Direct CMake to use dpcpp rather than the default C++ compiler/linker + set(CMAKE_CXX_COMPILER dpcpp) +else() # Windows + # Force CMake to use dpcpp rather than the default C++ compiler/linker + # (needed on Windows only) + include (CMakeForceCompiler) + CMAKE_FORCE_CXX_COMPILER (dpcpp IntelDPCPP) + include (Platform/Windows-Clang) +endif() + +cmake_minimum_required (VERSION 3.4) + +project(APFloat CXX) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/License.txt new file mode 100755 index 0000000000..7c8b8a36c6 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/License.txt @@ -0,0 +1,23 @@ +Copyright Intel Corporation + +SPDX-License-Identifier: MIT +https://opensource.org/licenses/MIT + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/README.md new file mode 100755 index 0000000000..616c244dea --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/README.md @@ -0,0 +1,475 @@ +# Using the Algorithmic C Fixed Point data type 'ap_float' + +This FPGA tutorial demonstrates how to use the Algorithmic C (AC) data type `ap_float` and some best practices. + +***Documentation***: The [DPC++ FPGA Code Samples Guide](https://software.intel.com/content/www/us/en/develop/articles/explore-dpcpp-through-intel-fpga-code-samples.html) helps you to navigate the samples and build your knowledge of DPC++ for FPGA.
+The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) is the reference manual for targeting FPGAs through DPC++.
+The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04/20.04, RHEL*/CentOS* 8, SUSE* 15; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA
Intel® FPGA Programmable Acceleration Card (PAC) D5005 (with Intel Stratix® 10 SX)
Intel® FPGA 3rd party / custom platforms with oneAPI support
*__Note__: Intel® FPGA PAC hardware is only compatible with Ubuntu 18.04* +| Software | Intel® oneAPI DPC++ Compiler
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | Including and using the `ap_float` type
Using the `ap_float` type to trade off mathematical accuracy for lower resource utilization
Using various `ap_float` rounding modes and their effect on accuracy and resource utilization
Using the `ap_float `math functions for better quality of results +| Time to complete | 1 hour + +## Purpose + +This FPGA tutorial shows how to use the `ap_float` type with some simple examples and recommended best practices. + +This data-type can be used in place of native floating point types to generate area efficient and optimized designs for the FPGA. For example, operations which do not utilize all of the bits the native types or desings which do not require all of the range and precision of native types are good candidates for replacement with the `ap_float` type. + +This tutorial will present the following: +1. How to include the `ap_float` type and an overview of common `ap_float` use cases. +2. A Polynomial Sine Approximation example which illustrates how to trade off mathematical accuracy for lesser FPGA resource utilization. +3. Rounding Mode and native type to `ap_float` type conversion examples which describe various `ap_float` rounding modes and their effect on accuracy and FPGA resource utilization. +4. A Quadratic Equation Solver example which show cases explicit `ap_float` math functions and how they can be used to replace mathematical operators like `*, /, +` and `-` for better quality of results. + +## Simple Code Example + +An `ap_float` number can be defined as follows: + +```cpp +ihc::ap_float a; +``` +Here `EW` specifies the exponent width and `MW` specifies the mantissa width of the number. Optionally, another template parameter can be specified to set the rounding mode. For more details please refer to the section titled `Variable-Precision Integer and Floating-Point Support` in the Intel® oneAPI DPC++ FPGA Optimization Guide. + +To use this type in your code, you must include the following header: + +```cpp +#include +``` + +To use `ap_float` math functions, you must include the following header: + +```cpp +#include +``` + +Additionally, you must use the flag `-qactypes` (Linux) / `/Qactypes` (Windows) in order to ensure that the headers are correctly included and that the compiler links against the necessary libraries for emulation support. Specify the flag to `dpcpp` if you are invoking `dpcpp` on the command line. The `CMake` file provided with this tutorial will do so automatically. + +You can easily convert your existing designs that use native floating-point types to use `ap_float`: simply switch the original type. For math functions, `ap_float` has the "ihc_" prefix, you can simply switch your math functions accordingly, e.g. `sin(x)` should be changed to `ihc_sin(x)` for `ap_float`. + +After the migration, you can use the area report to examine the area improvement of your design. In general, the line structure of the area report does not change. For example, instead of seeing a `X bit floating-point multiply` on the old design, the source line for the changed design would show `fpga.vpfp.mul`. + +You should confirm that the area used for the operation has indeed decreased from a Quartus compile. You should also make sure that the result of your design still meets your accuracy expectations through simulation. + +## Overview of Common Use Cases for `ap_float` + +You should consider migrating to `ap_float` types when you have precision requirements that differ from native `float` and `double` types, including both the range (number of exponent bits) and precision (number of mantissa bits) metrics. 
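For instance, here is a minimal sketch of a custom-width type. The widths and variable names are illustrative only, and the snippet assumes the `ap_float` header and the `-qactypes`/`/Qactypes` flag described in the Simple Code Example section above:

```cpp
// Illustrative widths: 9 exponent bits give slightly more range than float (8),
// and 30 mantissa bits give more precision than float (23) but far less than
// double (52), so the operators can be cheaper than full double-precision math.
using CustomFloat = ihc::ap_float<9, 30>;

CustomFloat gain = 0.75f;            // constructed from a float literal
CustomFloat sample = 2.5f;
CustomFloat scaled = gain * sample;  // arithmetic operators work as with native types
```

Because `ap_float` blends in with native C++ syntax, this kind of migration is usually a type substitution rather than a rewrite of the kernel.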
+ +Double precision operations cannot be placed into a single hardened DSP block like single-precision operations, so double precision operations are significantly more area intensive and use more hardware resources. Moreover, `float` only has 23 bits of mantissa while `double` has 52, this could be an overkill for applications that only seek a sweet spot in between. + +Additionally, the built in subnormal support with native `double` type is area intensive and being able to turn subnormal support off can be great for reducing area utilization if the application does not consider very small subnormal numbers. + +Finally, the various rounding modes offered along with the `ap_float` type can help trade-off mathematical accuracy for FPGA resource utilization. + +## Trading Off Mathematical Accuracy for Better Resource Utilization + +The kernels `ApproximateSineWithDouble` and `ApproximateSineWithAPFloat` implement a simple polynomial approximation of the sine function. + +The former uses `double` type to do so and the latter uses an `ap_float<11,44, Rnd>`. The `Rnd` rounding mode rounds towards zero. These two kernels will illustrate how to trade off accuracy for lesser FPGA resource utilization. + +See the section `Examining the Reports` to go over the differences in resource utilization between these kernels. See the section `Example of Output` to see the difference in accuracy of results produced by these kernels. + +Note how the kernel function within `RunSineApproximationKernel()` has been written once and the individual kernels are only differentiated by their input/output data types: `ApproximateSineWithDouble` uses `double` data type and `ApproximateSineWithAPFLoat` uses `ap_float` data type. + +```cpp +// Approximate with native double type +RunSineApproximationKernel(q, input, + double_result); +... +constexpr auto Rnd = ihc::fp_config::FP_Round::RZERO; +using ap_float_double = ihc::ap_float<11, 44, Rnd>; + +RunSineApproximationKernel( + q, ap_float_input, ap_float_result); +``` + +This code-reuse is because `ap_float` is designed to fully blend in with native C++ types for syntax and semantics. + +## Conversion Between Native Types and `ap_float` + +In normal floating-point FPGA applications, floating-point literals are represented as compile-time constants and implemented as tie-offs (wires that directly connects to Gnd/Vcc) in RTL. This allows the construction of a constant to use no hardware resources in the FPGA flow. + +However, `ap_float` types that have non-standard exponent and mantissa widths cannot be trivially converted from C++ native `float` or `double` literals. As a result, the construction of an `ap_float` type may sometimes require FPGA logic resources to round the native floating-point constant to the specified `ap_float`. This is called 'intermediate conversion'. + +It is important to understand when the intermediate conversions can occur. Conversion does not only happen when you are explicitly casting numbers: it can also happen when you perform arithmetic operations using `ap_float` types with different precisions. Intermediate conversions are necessary because the operation needs to unify the types of the operands by promoting the less "dominant" types (types that have lower representable range). This is demonstrated by the kernel code in the function `TestConversionKernelA`. 
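The following simplified sketch illustrates the same promotion behavior. The alias names are illustrative; see `TestConversionKernelA` in `src/ap_float.cpp` for the complete kernel:

```cpp
using SmallFloat = ihc::ap_float<8, 23>;   // same widths as native float
using BigFloat   = ihc::ap_float<11, 52>;  // same widths as native double

SmallFloat x = 3.1f;  // direct bitwise copy of a float literal: no cast hardware
BigFloat   y = 4.1;   // direct bitwise copy of a double literal: no cast hardware

// 'num' stands in for a runtime kernel input of type SmallFloat.
SmallFloat num = 10.1f;

// y * num forces 'num' to be promoted to BigFloat, and the SmallFloat result of
// x * num is promoted to BigFloat before the addition, so both promotions show
// up as "cast" nodes in the reports (rounded with the default RNE mode).
auto res = x * num + y * num;
```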
+ +### Converting Native Numbers to `ap_float` Numbers with Minimal FPGA Hardware Resources + +There are a few ways to generate compile-time `ap_float` constants that do not require any hardware implementation: + + 1. Initializing `ap_float<8,23>` from `float` or `ap_float<11,52>` from `double` is just a direct bitwise copy (wires in RTL), so if the input `float`/`double` is a compile-time constant, the constructed `ap_float` is also a compile-time constant. You may want to extend these two types instead of the native `float` and `double` type if you want to use `ap_float` specific floating-point arithmetic controls (for example, the explicit binary operation presented in the section titled `ap_float_explicit_arithmetic`). + + 2. Converting from a constant to another `ap_float` that has rounding mode `FP_Round::ZERO` also results in a compile time constant. This rounding mode is also respected in a binary operation when promotion rounding is required. This is demonstrated by the kernel code in the function `TestConversionKernelB()`. + + 3. The `convert_to` method of an `ap_float` returns itself rounded to a different type, it accepts a rounding mode as either accurate and area-intensive `RNE` mode (rounds to nearest, tie breaks to even) or inaccurate and non area-intensive `RZERO` (truncate towards zero) mode. When using `RZERO`, the compiler will also be able to convert a constant at compile time. This conversion bypasses the original rounding mode of the `ap_float` type. It is demonstrated by the code in `ConversionKernelC`. + +The kernel code in this tutorial contains comments that describe which operations result in generation of explicit cast operations and which do not. + +Note: + 1. When assigning the result of the `convert_to` function to another `ap_float`, if the left hand side of the assignment has different exponent or mantissa widths than the ones specified in the `convert_to` function on the right hand side, another conversion can occur. + + 2. If your code performs computations on constant/literal native floating-point values, the compiler can sometimes combine them at compile time and save area. This is a compiler optimization technique called 'constant folding' or 'constant propagation'. Please note that this optimization does not work for `ap_float` even when the operands are constant. You should compute your constant arithmetics in native types or pre-compute them by hand. + +## Using Explicit `ap_float` Math Functions in Place of Mathematical Operators + +In C++ applications, the basic binary operations have little expressiveness. On the contrary, FPGAs implement these operations using configurable logic, so you can improve your design's performance by fine-tuning the floating-point operations since they are usually area and latency intensive. + +The kernel code in the function `TestSpecializedQuadraticEqnSolver()` demonstrates how to use the explicit versions of `ap_float` binary operators to perform floating-point arithmetic operations based on your need. + +You can fine-tune the floating-point arithmetic operations when you are multiplying numbers with different precisions and/or outputting the +result of the multiply with a different precision. + +You may also want to fine-tune arithmetic operations when you are not concerned about the accuracy of the operation, or when you expect your values to easily fall into the subnormal range and you do not wish to flush them to zero when that happens. 
+ +To address these use cases, `ap_float` provides an explicit version of binary operators using template functions. The explicit operators provide 3 more main features in addition to basic binary operators. + +1. Allow inputs and outputs with different precisions in the multiplication. + +2. Tweak the area and accuracy trade off of the binary operations. + + The binary operations have high accuracy by default and produce results that are 0.5 ULP off from the most correct result. Users can override the default to choose an implementation with less area but also less precision (1 ULP). + +3. Turn on/off subnormal support in the binary operations. + + To save area, subnormal support in the binary operators default to auto, this means it would be off unless there is direct hardened DSP support for it. Users can turn it on when the computation is expected to produce values close to 0, with some additional area. + +The general form of explicit operations are as follows: + +For addition, subtraction and division: +Syntax: + +``` + ap_float::add/sub/div(op1, op2) +``` + +Usage: + +* Rounds `op1` and `op2` to the specified `E` (exponent) and `M` (mantissa) widths +* Implements the operation with the provided accuracy and subnormal options. +* Returns the result with type `ap_float` + +For multiplication: +Syntax: + +``` + ap_float::mul(op1, op2) +``` + +Usage: + +* Leaves `op1` and `op2` intact +* Implements the operation with the provided accuracy and subnormal options. +* Returns the result with type `ap_float` + +The accuracy setting is optional and can be one of the `enum`s below: +``` + ihc::fp_config::FP_Accuracy::HIGH + ihc::fp_config::FP_Accuracy::LOW +``` + +The subnormal setting is optional can be one of the `enum`s below: +``` + ihc::fp_config::FP_Subnormal::ON + ihc::fp_config::FP_Subnormal::OFF +``` + +Note: +* Both `enum`s need to be compile time constants. +* You must specify the accuracy setting if you want to specify the subnormal setting. + +After fine-tuning the operations, the overall structure of the area report would remain the same, but for each of the fine-tuned operation, you should see an area reduction on the same line if you have chosen to use the low accuracy variant of the operation, or an area increase if you decide to enable subnormal support on an operation. + +### Code Example +This section of the tutorial corresponds to the kernels `SimpleQuadraticEqnSolverKernel` and `SpecializedQuadraticEqnSolverKernel`. + +`SimpleQuadraticEqnSolverKernel` demonstrates a design that uses `ap_float` with arithmetic operators to compute the quadratic formula. + +After you have successfully compiled the design, open +"part1_operator.prj/reports/report.html " and open the Area Analysis page. +Make sure you understand the resource utilization from each operation as we +will compare the data to the second part of the tutorial. + +`SpecializedQuadraticEqnSolverKernel` implements the same design but with the explicit `ap_float` math functions instead of the binary operators. Please refer to the comments in the code to understand how each operation has been tweaked. + +See the section `Examining the reports for the Quadratic Equation Solver Kernels` below to know more about what to look for in the reports. + +## Key Concepts +* `ap_float` can be used to improve the quality of results on the FPGA by leveraging various features like arbitrary precision, rounding modes, and explicit math functions. 
+* Use `ap_float` to reduce the range or precision of the operation as required as opposed to native floating point types which have fixed range and precision. +* Rounding mode `RZERO` produces simpler hardware at the cost of accuracy whereas the default rounding mode `RNE` produces more accurate results and uses more FPGA resources. +* The explicit math functions provided by `ap_float` can be used in place of binary math operators such as `+, -, *` and `/`. The functions provide template parameters for fine tuning the accuracy of the operation and turning subnormal number support on or off. + +## License + +Code samples are licensed under the MIT license. See +[License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details. + + +## Building the `ap_float` Tutorial + +### Include Files + +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the type of compute node and whether to run in batch or interactive mode. Compiles to FPGA are only supported on fpga_compile nodes. Executing programs on FPGA hardware is only supported on fpga_runtime nodes of the appropriate type, such as fpga_runtime:arria10 or fpga_runtime:stratix10. Neither compiling nor executing programs on FPGA hardware are supported on the login nodes. For more information, see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/documentation/base-toolkit/](https://devcloud.intel.com/oneapi/documentation/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Install the design in `build` directory from the design directory by running `cmake`: + + ```bash + mkdir build + cd build + ``` + + If you are compiling for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + + ```bash + cmake .. + ``` + + Alternatively, to compile for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command: + + ```bash + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + You can also compile for a custom FPGA platform. Ensure that the board support package is installed on your system. Then run `cmake` using the command: + ```bash + cmake .. -DFPGA_BOARD=: + ``` + +2. Compile the design using the generated `Makefile`. The following four build targets are provided that match the recommended development flow: + + * Compile and run for emulation (fast compile time, targets emulates an FPGA device) using: + + ```bash + make fpga_emu + ``` + + * Generate HTML optimization reports using: + + ```bash + make report + ``` + + * Compile and run on FPGA hardware (longer compile time, targets an FPGA device) using: + + ```bash + make fpga + ``` + +3. (Optional) As the above hardware compile may take several hours to complete, FPGA precompiled binaries (compatible with Linux* Ubuntu* 18.04) can be downloaded here. + +### On a Windows* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake -G "NMake Makefiles" .. + ``` + Alternatively, to compile for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command: + + ``` + cmake -G "NMake Makefiles" .. 
-DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + You can also compile for a custom FPGA platform. Ensure that the board support package is installed on your system. Then run `cmake` using the command: + ``` + cmake -G "NMake Makefiles" .. -DFPGA_BOARD=: + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + nmake fpga_emu + ``` + * Generate the optimization report: + ``` + nmake report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + nmake fpga + ``` + +*Note:* The Intel® PAC with Intel Arria® 10 GX FPGA and Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX) do not yet support Windows*. Compiling to FPGA hardware on Windows* requires a third-party or custom Board Support Package (BSP) with Windows* support.
+*Note:* If you encounter any issues with long paths when compiling under Windows*, you may have to create your ‘build’ directory in a shorter path, for example c:\samples\build. You can then run cmake from that directory, and provide cmake with the full path to your sample directory. + +### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). +For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports + +Locate the pair of `report.html` files in either: + +* **Report-only compile**: `ap_float_report.prj` +* **FPGA hardware compile**: `ap_float.prj` + +### Examining the Area Reports for the Sine Approximation Kernels + +Navigate to the "Area Analysis-> Area Analysis of System" page. Click on the `Kernel System` line to expand it. + +Observe the difference in resource utilization of the kernels `ApproximateSineWithDouble` and `ApproximateSineWithAPFloat`. + +Expand the lines with the kernel names by clicking on them and expand the sub hierarchies to observe how the `add, mult` and `div` +operations use lesser resources for the `ApproximateSineWithAPFloat` kernel. + +You should observe an area reduction in resource utilization of up to 30% for the binary operations. + +TODO: Is simulation supported for customers yet? +Open the "Throughput Analysis > Verification Statistics" page. You should observe around 15% percent latency reduction when using the `ap_float` type. + +You should also note that simulations indicate that the values computed for both designs are the same. This indicates that our original precision expectation is still satisfied. + +### Examining the Reports for Conversion Kernels + +You can find the usages of conversion in both the area report and the graph viewer. The name of the rounding block is "cast". +Let's look at the reports and analyze each kernel in the tutorial. + +1. Kernel: `ConversionKernelA` + This kernel uses the default rounding mode `RNE` - round to nearest. + + Navigate to the *System Viewer* report (*Views* > *System Viewer*) and on the left pane, click on the cluster under `ConversionKernelA`. You should see the conversion functions mentioned in the comments of the source code as "cast" nodes in the graph. The casts from literal types are eliminated at compile time. + + The 4 cast nodes correspond to the following pieces of code: + + ```cpp + const floatTy z = 4.1; + ``` + + ```cpp + auto res = x * num_accessor[0] + y * num_accessor[0] + z; + ``` + + The comments in the kernel code describe how these 2 lines generate 4 "cast" nodes. + +2. Kernel: `ConversionKernelB` + This kernel uses the simpler rounding mode `RZERO`. + + In the graph for the cluster under `ConversionKernelB`, you will find that it now only contains one "cast" node. This corresponds to the code: + ```cpp + x * num_accessor[0] + ... + ``` + Although `x` and `num_accessor[0]` represent `ap_float`s constructed to use rounding mode `RZERO`, the result of this operation is cast to the higher precision `ap_float` using the default rounding mode `RNE` as the multiplication result is an operand for the next operation which uses higher precision. + + The other cast node is represented by a combination of `shift`, `select`, and `and` operations hence only one node labeled as "cast" is visible in the reports. 
+ + The reduction in the number of cast nodes as compared to `ConversionKernelA` results in reduction of hardware resources used by `ConversionKernelB`. + + Observe the differences in the resource usage of these two kernels by navigating to the *Area Analysis of System* report (*Area Analysis* > *Area Analysis of System*) and looking at the entries under `Kernel System`. + +2. Kernel: `ConversionKernelC` + This kernel shows how to use the `convert_to` function and modify the rounding mode for a specific operation. + + In the graph for the cluster under `Kernel_C`, you will find that it contains two "cast" nodes, corresponding to the conversions: + ```cpp + auto res = (x * num_accessor[0]).convert_to<11, 52, RndN>() + // This conversion is done explicitly + y * num_accessor[0] + // This conversion is done explicitly + z.convert_to<11, 52, RndZ>(); + ``` + +### Examining the Reports for the Quadratic Equation Solver Kernels + +Navigate to the "Area Analysis of System" report under the "Area Analysis" tab and expand the "Kernel System" section. Observe the differences in area utilization for the two kernels `SimpleQuadraticEqnSolverKernel` and `SpecializedQuadraticEqnSolverKernel`. You should observe a decrease in area of the multiplier in the calculation of `b*b - 4*a*c` at their corresponding line numbers. + +You should also observe a significant area estimation reduction of the divider from changing it to the low accuracy mode in the report. Also note that the area increase of the subtraction as we enable the subnormal support. + +## Running the Sample + +1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + + ```bash + ./ap_float.fpga_emu # Linux + ap_float.fpga_emu.exe # Windows + ``` + +2. Run the sample on the FPGA device + + ```bash + ./ap_float.fpga # Linux + ``` + +### Example of Output + +```txt +Testing basic arithmetic operators to approximate the sine function + +Native Type Result: +Result = 0.707 +Expected = 0.707 +Difference = 1.11e-16 + +Non Native Type Result: +Result = 0.707 +Expected = 0.707 +Difference = 5.12e-14 + +PASSED + +Testing conversions in ap_float +Result = 76.8 +Expected = 76.8 +Difference = 1.81e-06 + +Testing conversions in ap_float with rounding mode RZERO +Result = 76.8 +Expected = 76.8 +Difference = 1.81e-06 + +Testing conversions in ap_float using the convert_to function +Result = 76.8 +Expected = 76.8 +Difference = 1.81e-06 + +PASSED + +Calculating quadratic equation in higher precision +Result = 3.26 and 1.84 +Expected = 3.26 and 1.84 +Difference = 2.19e-07 and 1.24e-07 + +Result = -0.8 and -1.25 +Expected = -0.8 and -1.25 +Difference = 8.48e-08 and 1.32e-07 + +Result = 0 and -0.1 +Expected = 0 and -0.1 +Difference = 0 and 1.49e-09 + +Result = NaN and NaN +Expected = NaN and NaN + +Calculating quadratic equation with the optimized kernel +Result = 3.26 and 1.84 +Expected = 3.26 and 1.84 +Difference = 2.19e-07 and 1.24e-07 + +Result = -0.8 and -1.25 +Expected = -0.8 and -1.25 +Difference = 8.48e-08 and 1.32e-07 + +Result = 0 and -0.1 +Expected = 0 and -0.1 +Difference = 0 and 1.49e-09 + +Result = NaN and NaN +Expected = NaN and NaN + +PASSED +``` + +### Discussion of Results +`ap_float` can be leveraged to improve the design performance and fine tune FPGA resource utilization. 
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.sln b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.sln new file mode 100755 index 0000000000..968fd8d9e2 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ap_float", "ap_float.vcxproj", "{73FCAD5C-4C93-4786-B662-A7273C515E22}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Debug|x64.ActiveCfg = Debug|x64 + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Debug|x64.Build.0 = Debug|x64 + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Release|x64.ActiveCfg = Release|x64 + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {DE911CD1-4F98-4391-BD43-B02212357F5E} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.vcxproj new file mode 100755 index 0000000000..44440f48fd --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.vcxproj @@ -0,0 +1,164 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {73fcad5c-4c93-4786-b662-a7273c515e22} + Win32Proj + ap_float + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + /Qactypes + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR /Qactypes %(AdditionalOptions) + $(IntDir)ap_float.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + /Qactypes + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + /Qactypes + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR /Qactypes %(AdditionalOptions) + $(IntDir)ap_float.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + /Qactypes + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/sample.json new file mode 100755 index 0000000000..28754f4338 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/sample.json @@ -0,0 +1,61 @@ +{ + "guid": "4EEE7A9B-7C7A-4A20-B758-ADC4975626FC", + "name": "AP Float", + "categories": ["Toolkit/oneAPI Direct Programming/DPC++ FPGA/Tutorials/Features"], + "description": 
"An Intel® FPGA tutorial demonstrating how to use the Algorithmic C Arbitrary Precision Floating Point data type (AP Float)", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "dpcpp --version", + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./ap_float.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "dpcpp --version", + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "dpcpp --version", + "cd ../../..", + "mkdir build", + "cd build", + "cmake -G \"NMake Makefiles\" ../Tutorials/Features/ap_float", + "nmake fpga_emu", + "ap_float.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "dpcpp --version", + "cd ../../..", + "mkdir build", + "cd build", + "cmake -G \"NMake Makefiles\" ../Tutorials/Features/ap_float", + "nmake report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/CMakeLists.txt new file mode 100755 index 0000000000..0cf8aa5467 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/CMakeLists.txt @@ -0,0 +1,80 @@ +# To see a Makefile equivalent of this build system: +# https://github.com/oneapi-src/oneAPI-samples/blob/master/DirectProgramming/DPC++/ProjectTemplates/makefile-fpga + +set(SOURCE_FILE ap_float.cpp) +set(TARGET_NAME ap_float) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# FPGA board selection +if(NOT DEFINED FPGA_BOARD) + set(FPGA_BOARD "intel_a10gx_pac:pac_a10") + message(STATUS "FPGA_BOARD was not specified.\ + \nConfiguring the design to run on the default FPGA board ${FPGA_BOARD} (Intel(R) PAC with Intel Arria(R) 10 GX FPGA). \ + \nPlease refer to the README for information on board selection.") +else() + message(STATUS "Configuring the design to run on FPGA board ${FPGA_BOARD}") +endif() + +# These are Windows-specific flags: +# 1. /EHsc This is a Windows-specific flag that enables exception handling in host code +# 2. /Qactypes Include ac_types headers and link against ac_types emulation libraries +if(WIN32) + set(WIN_FLAG "/EHsc") + set(AC_TYPES_FLAG "/Qactypes") +else() + set(AC_TYPES_FLAG "-qactypes") +endif() + +# A DPC++ ahead-of-time (AoT) compile processes the device code in two stages. +# 1. The "compile" stage compiles the device code to an intermediate representation (SPIR-V). +# 2. The "link" stage invokes the compiler's FPGA backend before linking. +# For this reason, FPGA backend flags must be passed as link flags in CMake. +set(EMULATOR_COMPILE_FLAGS "${WIN_FLAG} -fintelfpga ${AC_TYPES_FLAG} -DFPGA_EMULATOR -Wall") +set(EMULATOR_LINK_FLAGS "-fintelfpga ${AC_TYPES_FLAG}") +set(HARDWARE_COMPILE_FLAGS "${WIN_FLAG} -fintelfpga ${AC_TYPES_FLAG} -Wall") +set(HARDWARE_LINK_FLAGS "-fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard=${FPGA_BOARD} ${USER_HARDWARE_FLAGS}") +# We do not need to supply the AC_TYPES_FLAG for the 'report' target's linking stage. 
+set(REPORT_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${FPGA_BOARD} ${USER_HARDWARE_FLAGS}") +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation + +############################################################################### +### FPGA Emulator +############################################################################### +# To compile in a single command: +# dpcpp -fintelfpga ${AC_TYPES_FLAG} -DFPGA_EMULATOR fpga_compile.cpp -o fpga_compile.fpga_emu +# CMake executes: +# [compile] dpcpp -fintelfpga ${AC_TYPES_FLAG} -DFPGA_EMULATOR -o fpga_compile.cpp.o -c fpga_compile.cpp +# [link] dpcpp -fintelfpga ${AC_TYPES_FLAG} fpga_compile.cpp.o -o fpga_compile.fpga_emu +add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) +set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS "${EMULATOR_COMPILE_FLAGS}") +set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS "${EMULATOR_LINK_FLAGS}") +add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + +############################################################################### +### Generate Report +############################################################################### +# To compile manually: +# dpcpp -fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard= -fsycl-link=early ap_float.cpp -o ap_float_report.a +set(FPGA_EARLY_IMAGE ${TARGET_NAME}_report.a) +# The compile output is not an executable, but an intermediate compilation result unique to DPC++. +add_executable(${FPGA_EARLY_IMAGE} ${SOURCE_FILE}) +add_custom_target(report DEPENDS ${FPGA_EARLY_IMAGE}) +set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}") +set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES LINK_FLAGS "${REPORT_LINK_FLAGS} -fsycl-link=early") +# fsycl-link=early stops the compiler after RTL generation, before invoking Quartus® + +############################################################################### +### FPGA Hardware +############################################################################### +# To compile in a single command: +# dpcpp -fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard= ap_float.cpp -o ap_float.fpga +# CMake executes: +# [compile] dpcpp -fintelfpga ${AC_TYPES_FLAG} -o ap_float.cpp.o -c ap_float.cpp +# [link] dpcpp -fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard= ap_float.cpp.o -o ap_float.fpga +add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) +add_custom_target(fpga DEPENDS ${FPGA_TARGET}) +set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}") +set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS "${HARDWARE_LINK_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${FPGA_TARGET}") +# The -reuse-exe flag enables rapid recompilation of host-only code changes. +# See DPC++FPGA/GettingStarted/fast_recompile for details. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/ap_float.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/ap_float.cpp new file mode 100644 index 0000000000..03fbb33020 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/ap_float.cpp @@ -0,0 +1,528 @@ +// clang-format off +#include +#include +#include +#include +#include // for std::setprecision + +#define _USE_MATH_DEFINES // need to define this for Windows +#include +// clang-format on + +// dpc_common.hpp can be found in the dev-utilities include folder. 
+// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + +// Include some helper functions for this tutorial +#include "util.hpp" +using namespace sycl; + +// Forward declare the kernel name in the global scope. +// This is a FPGA best practice that reduces name mangling in the optimization +// reports. +class ApproximateSineWithDouble; +class ApproximateSineWithAPFloat; + +class ConversionKernelA; +class ConversionKernelB; +class ConversionKernelC; + +class SimpleQuadraticEqnSolverKernel; +class SpecializedQuadraticEqnSolverKernel; + +constexpr int kSineApproximateTermsCount = 10; +constexpr double kSineApproximationEpsilon = 1e-13; + +// ap_float< 8,23> has the same number of exponent and mantissa bits as native +// float type +using APFloatType = ihc::ap_float<8, 23>; + +// ap_float<11,52> has the same number of exponent and mantissa bits as native +// double type +using APDoubleType = ihc::ap_float<11, 52>; + +using PairAPDoubleType = std::pair; + +// Now we are changing the rounding mode on APFloatType and APDoubleType +constexpr auto kRoundingModeRZERO = ihc::fp_config::FP_Round::RZERO; + +// ap_float< 8,23> has the same number of exponent and mantissa bits as float +using APFloatTypeB = ihc::ap_float<8, 23, kRoundingModeRZERO>; + +// ap_float<11,52> has the same number of exponent and mantissa bits as double +using APDoubleTypeB = ihc::ap_float<11, 52, kRoundingModeRZERO>; + +constexpr auto kRoundingModeRNE = ihc::fp_config::FP_Round::RNE; + +// -------------------------------------------------------------------------- // +// Polynomial Sine Approximation example +// -------------------------------------------------------------------------- // + +template +void RunSineApproximationKernel(queue &q, const T &input, T &output) { + buffer inp_buffer(&input, 1); + buffer res_buffer(&output, 1); + + q.submit([&](handler &h) { + accessor x{inp_buffer, h, read_only}; + accessor retval{res_buffer, h, write_only, no_init}; + + h.single_task([=] { + T res = 0.0; + T sign = 1.0; + T term = x[0]; + T numerator = x[0]; + T denom = 1.0; + +#pragma unroll + for (int i = 1; i <= kSineApproximateTermsCount; ++i) { + res += term; + sign = -sign; + denom *= 2 * i * (2 * i + 1); + numerator *= x[0] * x[0]; + term = sign * numerator / denom; + } + retval[0] = res; + }); + }); +} + +void TestSineApproximation(queue &q) { + bool passed_native = false, passed_non_native = false; + + std::cout << "Testing basic arithmetic operators to approximate the sine " + "function\n\n"; + + double input = M_PI_4; // pi / 4 + double expected = + M_SQRT1_2; // 1/square_root(2), it is the value of sin(input); + double double_result; + + // Approximate with native double type + RunSineApproximationKernel(q, input, + double_result); + + // Approximate with ap_float type + // We set the rounding mode to RZERO (truncate to zero) because this allows us + // to generate compile-time ap_float constants from double type literals shown + // below, which eliminates the area usage for initialization. 
+ using APDoubleTypeC = ihc::ap_float<11, 44, kRoundingModeRZERO>; + + APDoubleTypeC ap_float_input = (APDoubleTypeC)input; + APDoubleTypeC ap_float_result; + + RunSineApproximationKernel( + q, ap_float_input, ap_float_result); + + double difference_a = std::abs(double_result - expected); + double difference_b = std::abs((double)ap_float_result - expected); + + std::cout << "Native Type Result:\n"; + std::cout << "Result = " << std::setprecision(3) << (double)double_result + << "\n"; + std::cout << "Expected = " << std::setprecision(3) << (double)expected + << "\n"; + std::cout << "Difference = " << std::setprecision(3) << (double)difference_a + << "\n\n"; + + std::cout << "Non Native Type Result:\n"; + std::cout << "Result = " << std::setprecision(3) + << (double)ap_float_result << "\n"; + std::cout << "Expected = " << std::setprecision(3) << (double)expected + << "\n"; + std::cout << "Difference = " << std::setprecision(3) << (double)difference_b + << "\n"; + + passed_native = (difference_a < kSineApproximationEpsilon); + passed_non_native = (difference_b < kSineApproximationEpsilon); + + if (passed_native && passed_non_native) { + std::cout << "\nPASSED\n\n"; + } else { + std::cout << "\nFAILED\n\n"; + } +} + +// -------------------------------------------------------------------------- // +// Rounding Mode and native type to ap_float type conversion examples +// -------------------------------------------------------------------------- // + +// The default rounding mode when converting from other types to APFloatType and +// APDoubleType is RNE (round to nearest) This rounding mode provides better +// accuracy but can be more area intensive than RZERO(truncate to zero) +void TestConversionKernelA(queue &q, const APFloatType &num, + APDoubleType &res) { + buffer inp_buffer(&num, 1); + buffer res_buffer(&res, 1); + + q.submit([&](handler &h) { + accessor num_accessor{inp_buffer, h, read_only}; + accessor res_accessor{res_buffer, h, write_only, no_init}; + + h.single_task([=] { + // This is a direct bitcast: x and y will be compile time constants and + // hence no cast operation will be generated for it. + const APFloatType x = 3.1f; + const APDoubleType y = 4.1; + + // This is not free, construction will result in a cast block in RTL from + // double to float. Constant propagation will not be able to remove this + // block since the rounding logic for RNE is quite complicated + const APFloatType z = 4.1; + + // When mixing types in arithmetic operations, rounding operations are + // needed to promote different types to the same: + // - x and num are of the same type, so no conversion is required + // - y and num are not of the same type, num will be promoted to the more + // dominant APDoubleType type and this will result in generation of a + // cast operation + // - result of x * num will be promoted to APDoubleType before being added + // to y * num, this will generate a cast operation + // - z will be promoted to APDoubleType before being added to the rest, + // requiring another cast operation + auto res = x * num_accessor[0] + y * num_accessor[0] + z; + res_accessor[0] = res; + }); + }); +} + +// The rounding mode when converting from other types to APFloatTypeB and +// APDoubleTypeB is RZERO (truncate towards). 
This rounding mode is simpler and +// can be constant-propagated +void TestConversionKernelB(queue &q, const APFloatTypeB &num, + APDoubleTypeB &res) { + buffer inp_buffer(&num, 1); + buffer res_buffer(&res, 1); + + q.submit([&](handler &h) { + accessor num_accessor{inp_buffer, h, read_only}; + accessor res_accessor{res_buffer, h, write_only, no_init}; + + h.single_task([=] { + const APFloatTypeB x = 3.1f; + const APDoubleTypeB y = 4.1; + + // Constant propagation will be able to make z a compile-time constant + // with rounding mode RZERO + const APFloatTypeB z = 4.1; + + // - x * num : the result of the multiply is cast (promoted) using RNE, + // resulting in a cast block + // - y * num : num is cast (promoted) using RZERO which doesn't need an + // explicit cast block in hardware + // - z : cast version of z is also a compile time constant so no hardware + // is generated for the conversion + auto res = x * num_accessor[0] + y * num_accessor[0] + z; + res_accessor[0] = res; + }); + }); +} + +// For Kernel C, we are using RNE for the conversion on both types. However, +// sometimes we still want to deploy other modes of conversion, especially for +// constructing and casting constants. +void TestConversionKernelC(queue &q, const APFloatType &num, + APDoubleType &res) { + buffer inp_buffer(&num, 1); + buffer res_buffer(&res, 1); + + q.submit([&](handler &h) { + accessor num_accessor{inp_buffer, h, read_only}; + accessor res_accessor{res_buffer, h, write_only, no_init}; + + h.single_task([=] { + const APFloatType x = 3.1f; + const APDoubleType y = 4.1; + + // y is a compile time constant, so converting y to z using RZERO will + // also produce a compile time constant. + const APFloatType z = y.convert_to<8, 23, kRoundingModeRZERO>(); + + // The convert_to function allows you to convert ap_float of different + // precisions using different modes, but you must make sure the receiving + // type of the convert_to function matches the exponent and mantissa width + // of the convert_to arguments. 
+ auto res = + (x * num_accessor[0]) + .convert_to<11, 52, + kRoundingModeRNE>() + // This conversion generates + // a cast operation + y * num_accessor[0] + // The conversion of num to APDoubleType + // creates a cast operation + z.convert_to<11, 52, + kRoundingModeRZERO>(); // the result of this conversion + // is a compile time constant + res_accessor[0] = res; + }); + }); +} + +template +bool RunSpecifiedConversionKernel(queue &q, + void (*kernel_func)(queue &, const T1 &, + T2 &)) { + constexpr double kConversionKernelEpsilon = 1e-5; + + T1 input = (10.1f); + T2 res; + kernel_func(q, input, res); + + double expected = (3.1 * input) + (4.1 * input) + 4.1; + double difference = (res - expected).abs(); + + std::cout << "Result = " << std::setprecision(3) << (double)res << "\n"; + std::cout << "Expected = " << std::setprecision(3) << (double)expected + << "\n"; + std::cout << "Difference = " << std::setprecision(3) << (double)difference + << "\n\n"; + + return difference < kConversionKernelEpsilon; +} + +void TestAllConversionKernels(queue &q) { + std::cout << "Testing conversions in ap_float\n"; + bool passed_A = RunSpecifiedConversionKernel( + q, TestConversionKernelA); + + std::cout << "Testing conversions in ap_float with rounding mode RZERO\n"; + bool passed_B = RunSpecifiedConversionKernel( + q, TestConversionKernelB); + + std::cout + << "Testing conversions in ap_float using the convert_to function\n"; + bool passed_C = RunSpecifiedConversionKernel( + q, TestConversionKernelC); + + if (passed_A && passed_B && passed_C) { + std::cout << "PASSED\n\n"; + } else { + std::cout << "FAILED\n\n"; + } +} + +// -------------------------------------------------------------------------- // +// Quadratic Equation Solver example +// -------------------------------------------------------------------------- // + +// This kernel computes the two roots from a quadratic equation with +// coefficient a, b, and c, for real numbers only, using the simple mathematical +// operators *, / etc. +void TestSimpleQuadraticEqnSolver(queue &q, const float A, const float B, + const float C, PairAPDoubleType &r) { + APDoubleType root1, root2; + + { + buffer inp1_buffer(&A, 1); + buffer inp2_buffer(&B, 1); + buffer inp3_buffer(&C, 1); + + buffer root1_buffer(&root1, 1); + buffer root2_buffer(&root2, 1); + + q.submit([&](handler &h) { + accessor x{inp1_buffer, h, read_only}; + accessor y{inp2_buffer, h, read_only}; + accessor z{inp3_buffer, h, read_only}; + accessor r1{root1_buffer, h, write_only, no_init}; + accessor r2{root2_buffer, h, write_only, no_init}; + + h.single_task([=] { + APDoubleType a(x[0]), b(y[0]), c(z[0]); + auto rooted = b * b - 4.0 * a * c; + PairAPDoubleType ret; + if (rooted > 0.0 || rooted.abs() < 1e-20) { + if (rooted < 0.0) { + rooted = -rooted; + } + auto root = ihc::ihc_sqrt(rooted); + r1[0] = (-b + root) / (2.0 * a); + r2[0] = (-b - root) / (2.0 * a); + } else { + r1[0] = APDoubleType::nan(); + r2[0] = APDoubleType::nan(); + } + }); + }); + } + + r = std::make_pair(root1, root2); +} + +// SimpleQuadraticEqnSolverKernel was relatively area intensive and there +// are many potential optimization opportunities if we fine tune the arithmetic +// instructions. 
In SpecializedQuadraticEqnSolverKernel we will use the explicit +// ap_float math functions and customize them to improve our quality of results +void TestSpecializedQuadraticEqnSolver(queue &q, const float A, const float B, + const float C, PairAPDoubleType &r) { + // Accuracy and Subnormal Options must be compile time constants + constexpr auto kAccuracyLow = ihc::fp_config::FP_Accuracy::LOW; + constexpr auto kSubnormalOff = ihc::fp_config::FP_Subnormal::OFF; + constexpr auto kAccuracyHigh = ihc::fp_config::FP_Accuracy::HIGH; + constexpr auto kSubnormalOn = ihc::fp_config::FP_Subnormal::ON; + + APDoubleType root1, root2; + + { + buffer inp1_buffer(&A, 1); + buffer inp2_buffer(&B, 1); + buffer inp3_buffer(&C, 1); + + buffer root1_buffer(&root1, 1); + buffer root2_buffer(&root2, 1); + + q.submit([&](handler &h) { + accessor x{inp1_buffer, h, read_only}; + accessor y{inp2_buffer, h, read_only}; + accessor z{inp3_buffer, h, read_only}; + accessor r1{root1_buffer, h, write_only, no_init}; + accessor r2{root2_buffer, h, write_only, no_init}; + + h.single_task([=] { + // Use a smaller type if possible, single precision vs double + APFloatType a(x[0]), b(y[0]), c(z[0]); + + // By default subnormal number processing is off, but for the purpose of + // demonstration, we also spell it out + auto bsquare = APDoubleType::mul( + b, b); // here we avoid one upcast from float to double + auto fourA = APDoubleType::mul( + APFloatType(4.0f), a); // here we avoid one upcast again + auto fourAC = APDoubleType::mul( + fourA, c); // here we avoid one upcast as well + + // For the subtraction operation, we want to have subnormal number + // processed because the number can be really small we also want to have + // a higher precision since we are dealing with small numbers on which + // we make critical decisions on + auto rooted = + APDoubleType::sub(bsquare, fourAC); + + if (rooted > 0.0 || rooted.abs() < 1e-20) { + if (rooted < 0.0) { + rooted = -rooted; + } + auto root = ihc::ihc_sqrt(rooted); + // divider is expensive, low accuracy would provide a significant area + // gain. 
The default option for addition and multiplication (high + // accuracy and no subnormal) is OK + r1[0] = APDoubleType::div(-b + root, + 2.0 * a); + r2[0] = APDoubleType::div(-b - root, + 2.0 * a); + + } else { + r1[0] = APDoubleType::nan(); + r2[0] = APDoubleType::nan(); + } + }); + }); + } + + r = std::make_pair(root1, root2); +} + +bool TestQuadraticEqnSolverKernels(queue &q, + void (*func)(queue &, const float, + const float, const float, + PairAPDoubleType &)) { + constexpr double kQuadraticEqnEpsilon = 1e-6; + constexpr size_t kQuadraticTestsCount = 3; + + double testvec[kQuadraticTestsCount][3] = { + {1., -5.1, 6.}, {2., 4.1, 2.}, {1., 0.1, 0.}}; + + DoublePair golden_results[sizeof(testvec)]; + PairAPDoubleType outputs[kQuadraticTestsCount]; + bool passed = true; + + for (int i = 0; i < kQuadraticTestsCount; ++i) { + func(q, testvec[i][0], testvec[i][1], testvec[i][2], outputs[i]); + golden_results[i] = + quadratic_gold(testvec[i][0], testvec[i][1], testvec[i][2]); + + auto diff_root1 = + std::fabs((double)outputs[i].first - golden_results[i].first); + auto diff_root2 = + std::fabs((double)outputs[i].second - golden_results[i].second); + + std::cout << "Result = " << std::setprecision(3) + << (double)outputs[i].first << " and " << std::setprecision(3) + << (double)outputs[i].second << "\n"; + std::cout << "Expected = " << std::setprecision(3) + << (double)golden_results[i].first << " and " + << std::setprecision(3) << (double)golden_results[i].second + << "\n"; + std::cout << "Difference = " << std::setprecision(3) << (double)diff_root1 + << " and " << std::setprecision(3) << (double)diff_root2 << "\n"; + + if (diff_root1 > kQuadraticEqnEpsilon || + diff_root2 > kQuadraticEqnEpsilon) { + passed = false; + std::cout << "failed! difference exceeds kQuadraticEqnEpsilon = " + << kQuadraticEqnEpsilon << "\n"; + } + + std::cout << "\n"; + } + + // test the nan case + PairAPDoubleType nan_pair; + func(q, 1., 2., 4., nan_pair); + std::cout << "Result = " << nan_pair.first << " and " << nan_pair.second + << "\n"; + std::cout << "Expected = NaN and NaN\n"; + if (!(sycl::isnan((double)nan_pair.first) && + sycl::isnan((double)nan_pair.second))) { + passed = false; + std::cout << "failed! first or second is not a nan!" + << "\n"; + } + return passed; +} + +void TestQuadraticEquationSolverKernels(queue &q) { + std::cout << "Calculating quadratic equation in higher precision\n"; + auto test_a = TestQuadraticEqnSolverKernels(q, TestSimpleQuadraticEqnSolver); + std::cout << "\nCalculating quadratic equation with the optimized kernel\n"; + auto test_b = + TestQuadraticEqnSolverKernels(q, TestSpecializedQuadraticEqnSolver); + + if (test_a && test_b) { + std::cout << "\nPASSED\n"; + } else { + std::cout << "\nFAILED\n"; + } +} + +int main() { +#if defined(FPGA_EMULATOR) + ext::intel::fpga_emulator_selector selector; +#else + ext::intel::fpga_selector selector; +#endif + + try { + // Create the SYCL device queue + queue q(selector, dpc_common::exception_handler); + + TestSineApproximation(q); + TestAllConversionKernels(q); + TestQuadraticEquationSolverKernels(q); + + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cerr << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! 
+ if (e.code().value() == CL_DEVICE_NOT_FOUND) { + std::cerr << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cerr << "Run sys_check in the oneAPI root directory to verify.\n"; + std::cerr << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/util.hpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/util.hpp new file mode 100644 index 0000000000..f720e53112 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/util.hpp @@ -0,0 +1,16 @@ +#include +using DoublePair = std::pair; + +const inline DoublePair quadratic_gold(double a, double b, double c) { + auto rooted = b * b - 4.0 * a * c; + auto rooted_abs = fabs(rooted); + + DoublePair ret; + if (rooted > 0.0 || rooted_abs < 1e-20) { + auto root = sqrt(rooted_abs); + ret = std::make_pair((-b + root) / (2.0 * a), (-b - root) / (2.0 * a)); + } else { + ret = std::make_pair(NAN, NAN); + } + return ret; +} \ No newline at end of file From 83b3bcf7b90c5597e756b33b93b804471fc3f905 Mon Sep 17 00:00:00 2001 From: "Wang, Di5" Date: Fri, 1 Apr 2022 13:42:20 -0700 Subject: [PATCH 2/7] move ap_float under Features --- .../Tutorials/Features/{ac_types => }/ap_float/CMakeLists.txt | 0 .../Tutorials/Features/{ac_types => }/ap_float/License.txt | 0 .../Tutorials/Features/{ac_types => }/ap_float/README.md | 0 .../Tutorials/Features/{ac_types => }/ap_float/ap_float.sln | 0 .../Tutorials/Features/{ac_types => }/ap_float/ap_float.vcxproj | 0 .../Tutorials/Features/{ac_types => }/ap_float/sample.json | 0 .../Tutorials/Features/{ac_types => }/ap_float/src/CMakeLists.txt | 0 .../Tutorials/Features/{ac_types => }/ap_float/src/ap_float.cpp | 0 .../Tutorials/Features/{ac_types => }/ap_float/src/util.hpp | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename DirectProgramming/DPC++FPGA/Tutorials/Features/{ac_types => }/ap_float/CMakeLists.txt (100%) rename DirectProgramming/DPC++FPGA/Tutorials/Features/{ac_types => }/ap_float/License.txt (100%) rename DirectProgramming/DPC++FPGA/Tutorials/Features/{ac_types => }/ap_float/README.md (100%) rename DirectProgramming/DPC++FPGA/Tutorials/Features/{ac_types => }/ap_float/ap_float.sln (100%) rename DirectProgramming/DPC++FPGA/Tutorials/Features/{ac_types => }/ap_float/ap_float.vcxproj (100%) rename DirectProgramming/DPC++FPGA/Tutorials/Features/{ac_types => }/ap_float/sample.json (100%) rename DirectProgramming/DPC++FPGA/Tutorials/Features/{ac_types => }/ap_float/src/CMakeLists.txt (100%) rename DirectProgramming/DPC++FPGA/Tutorials/Features/{ac_types => }/ap_float/src/ap_float.cpp (100%) rename DirectProgramming/DPC++FPGA/Tutorials/Features/{ac_types => }/ap_float/src/util.hpp (100%) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/CMakeLists.txt similarity index 100% rename from DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/CMakeLists.txt rename to DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/CMakeLists.txt diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/License.txt similarity index 100% rename from DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/License.txt rename to 
DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/License.txt diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md similarity index 100% rename from DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/README.md rename to DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.sln b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/ap_float.sln similarity index 100% rename from DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.sln rename to DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/ap_float.sln diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/ap_float.vcxproj similarity index 100% rename from DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/ap_float.vcxproj rename to DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/ap_float.vcxproj diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/sample.json similarity index 100% rename from DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/sample.json rename to DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/sample.json diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/CMakeLists.txt similarity index 100% rename from DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/CMakeLists.txt rename to DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/CMakeLists.txt diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/ap_float.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp similarity index 100% rename from DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/ap_float.cpp rename to DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/util.hpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/util.hpp similarity index 100% rename from DirectProgramming/DPC++FPGA/Tutorials/Features/ac_types/ap_float/src/util.hpp rename to DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/util.hpp From c1668e9478e1b3b29b9a00552076ba0c7de4e3b1 Mon Sep 17 00:00:00 2001 From: "Wang, Di5" Date: Wed, 6 Apr 2022 06:57:12 -0700 Subject: [PATCH 3/7] Update README and sample code --- .../Tutorials/Features/ap_float/README.md | 86 +++++++++---------- .../Features/ap_float/src/ap_float.cpp | 68 ++++++++++----- 2 files changed, 85 insertions(+), 69 deletions(-) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md index 616c244dea..1db13f80da 100755 --- a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md @@ -1,4 +1,4 @@ -# Using the Algorithmic C Fixed Point data type 'ap_float' +# Using the Algorithmic C Fixed Point Data Type 'ap_float' This FPGA tutorial demonstrates how to use the Algorithmic C (AC) data type `ap_float` and some best practices. 
@@ -18,13 +18,13 @@ The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programmi This FPGA tutorial shows how to use the `ap_float` type with some simple examples and recommended best practices. -This data-type can be used in place of native floating point types to generate area efficient and optimized designs for the FPGA. For example, operations which do not utilize all of the bits the native types or desings which do not require all of the range and precision of native types are good candidates for replacement with the `ap_float` type. +This data-type can be used in place of native floating point types to generate area efficient and optimized designs for the FPGA. For example, operations which do not utilize all the bits of the native types or designs which do not require all of the range and precision of native types are good candidates for replacement with the `ap_float` type. This tutorial will present the following: 1. How to include the `ap_float` type and an overview of common `ap_float` use cases. 2. A Polynomial Sine Approximation example which illustrates how to trade off mathematical accuracy for lesser FPGA resource utilization. 3. Rounding Mode and native type to `ap_float` type conversion examples which describe various `ap_float` rounding modes and their effect on accuracy and FPGA resource utilization. -4. A Quadratic Equation Solver example which show cases explicit `ap_float` math functions and how they can be used to replace mathematical operators like `*, /, +` and `-` for better quality of results. +4. A Quadratic Equation Solver example which showcases explicit `ap_float` math functions and how they can be used to replace mathematical operators like `*, /, +` and `-` for better quality of results. ## Simple Code Example @@ -33,7 +33,7 @@ An `ap_float` number can be defined as follows: ```cpp ihc::ap_float a; ``` -Here `EW` specifies the exponent width and `MW` specifies the mantissa width of the number. Optionally, another template parameter can be specified to set the rounding mode. For more details please refer to the section titled `Variable-Precision Integer and Floating-Point Support` in the Intel® oneAPI DPC++ FPGA Optimization Guide. +Here `EW` specifies the exponent width and `MW` specifies the mantissa width of the number. Optionally, another template parameter can be specified to set the rounding mode. For more details please refer to the section [*Declare the ap_float Data Type*](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide/top/optimize-your-design/resource-use/data-types-and-operations/var-prec-fp-sup/declare-and-use-the-ac-data-types/declare-the-ap-float-data-type.html) in the Intel® oneAPI DPC++ FPGA Optimization Guide. To use this type in your code, you must include the following header: @@ -47,13 +47,13 @@ To use `ap_float` math functions, you must include the following header: #include ``` -Additionally, you must use the flag `-qactypes` (Linux) / `/Qactypes` (Windows) in order to ensure that the headers are correctly included and that the compiler links against the necessary libraries for emulation support. Specify the flag to `dpcpp` if you are invoking `dpcpp` on the command line. The `CMake` file provided with this tutorial will do so automatically. +Additionally, you must pass the flag `-qactypes` (Linux) / `/Qactypes` (Windows) to the `dpcpp` command when compiling your SYCL program in order to ensure that the headers are correctly included. 
Specify the flag to `dpcpp` if you are invoking `dpcpp` on the command line. The `CMake` file provided with this tutorial will do so automatically. You can easily convert your existing designs that use native floating-point types to use `ap_float`: simply switch the original type. For math functions, `ap_float` has the "ihc_" prefix, you can simply switch your math functions accordingly, e.g. `sin(x)` should be changed to `ihc_sin(x)` for `ap_float`. After the migration, you can use the area report to examine the area improvement of your design. In general, the line structure of the area report does not change. For example, instead of seeing a `X bit floating-point multiply` on the old design, the source line for the changed design would show `fpga.vpfp.mul`. -You should confirm that the area used for the operation has indeed decreased from a Quartus compile. You should also make sure that the result of your design still meets your accuracy expectations through simulation. +You should confirm that the area used for the operation has indeed decreased from a Quartus compile. ## Overview of Common Use Cases for `ap_float` @@ -67,22 +67,23 @@ Finally, the various rounding modes offered along with the `ap_float` type can h ## Trading Off Mathematical Accuracy for Better Resource Utilization -The kernels `ApproximateSineWithDouble` and `ApproximateSineWithAPFloat` implement a simple polynomial approximation of the sine function. +Two kernels `ApproximateSineWithDouble` and `ApproximateSineWithAPFloat`, instantiated from the template function `RunSineApproximationKernel()`, implement a simple polynomial approximation of the sine function with single and double precision respectively. The former uses `double` type to do so and the latter uses an `ap_float<11,44, Rnd>`. The `Rnd` rounding mode rounds towards zero. These two kernels will illustrate how to trade off accuracy for lesser FPGA resource utilization. -See the section `Examining the Reports` to go over the differences in resource utilization between these kernels. See the section `Example of Output` to see the difference in accuracy of results produced by these kernels. +See the section *Examining the Reports* to go over the differences in resource utilization between these kernels. See the section *Example of Output* to see the difference in accuracy of results produced by these kernels. Note how the kernel function within `RunSineApproximationKernel()` has been written once and the individual kernels are only differentiated by their input/output data types: `ApproximateSineWithDouble` uses `double` data type and `ApproximateSineWithAPFLoat` uses `ap_float` data type. ```cpp -// Approximate with native double type +// Approximate sine with native double type RunSineApproximationKernel(q, input, double_result); ... constexpr auto Rnd = ihc::fp_config::FP_Round::RZERO; using ap_float_double = ihc::ap_float<11, 44, Rnd>; +// Approximate sine with `ap_float` type RunSineApproximationKernel( q, ap_float_input, ap_float_result); ``` @@ -101,11 +102,11 @@ It is important to understand when the intermediate conversions can occur. Conve There are a few ways to generate compile-time `ap_float` constants that do not require any hardware implementation: - 1. Initializing `ap_float<8,23>` from `float` or `ap_float<11,52>` from `double` is just a direct bitwise copy (wires in RTL), so if the input `float`/`double` is a compile-time constant, the constructed `ap_float` is also a compile-time constant. 
You may want to extend these two types instead of the native `float` and `double` type if you want to use `ap_float` specific floating-point arithmetic controls (for example, the explicit binary operation presented in the section titled `ap_float_explicit_arithmetic`). + 1. Initializing `ap_float<8,23>` from `float` or `ap_float<11,52>` from `double` is just a direct bitwise copy (wires in RTL), so if the input `float`/`double` is a compile-time constant, the constructed `ap_float` is also a compile-time constant. You may want to extend these two types instead of the native `float` and `double` type if you want to use `ap_float` specific floating-point arithmetic controls (for example, the explicit binary operation presented in the section titled *ap_float_explicit_arithmetic*). 2. Converting from a constant to another `ap_float` that has rounding mode `FP_Round::ZERO` also results in a compile time constant. This rounding mode is also respected in a binary operation when promotion rounding is required. This is demonstrated by the kernel code in the function `TestConversionKernelB()`. - 3. The `convert_to` method of an `ap_float` returns itself rounded to a different type, it accepts a rounding mode as either accurate and area-intensive `RNE` mode (rounds to nearest, tie breaks to even) or inaccurate and non area-intensive `RZERO` (truncate towards zero) mode. When using `RZERO`, the compiler will also be able to convert a constant at compile time. This conversion bypasses the original rounding mode of the `ap_float` type. It is demonstrated by the code in `ConversionKernelC`. + 3. The `convert_to` method of an `ap_float` returns itself rounded to a different type, it accepts a rounding mode as either accurate and area-intensive `RNE` mode (rounds to nearest, tie breaks to even) or inaccurate and non area-intensive `RZERO` (truncate towards zero) mode. When using `RZERO`, the compiler will also be able to convert a constant at compile time. This conversion bypasses the original rounding mode of the `ap_float` type. It is demonstrated by the code in the function `TestConversionKernelC`. The kernel code in this tutorial contains comments that describe which operations result in generation of explicit cast operations and which do not. @@ -118,32 +119,12 @@ Note: In C++ applications, the basic binary operations have little expressiveness. On the contrary, FPGAs implement these operations using configurable logic, so you can improve your design's performance by fine-tuning the floating-point operations since they are usually area and latency intensive. -The kernel code in the function `TestSpecializedQuadraticEqnSolver()` demonstrates how to use the explicit versions of `ap_float` binary operators to perform floating-point arithmetic operations based on your need. - -You can fine-tune the floating-point arithmetic operations when you are multiplying numbers with different precisions and/or outputting the -result of the multiply with a different precision. - -You may also want to fine-tune arithmetic operations when you are not concerned about the accuracy of the operation, or when you expect your values to easily fall into the subnormal range and you do not wish to flush them to zero when that happens. - -To address these use cases, `ap_float` provides an explicit version of binary operators using template functions. The explicit operators provide 3 more main features in addition to basic binary operators. - -1. Allow inputs and outputs with different precisions in the multiplication. - -2. 
Tweak the area and accuracy trade off of the binary operations. - - The binary operations have high accuracy by default and produce results that are 0.5 ULP off from the most correct result. Users can override the default to choose an implementation with less area but also less precision (1 ULP). - -3. Turn on/off subnormal support in the binary operations. - - To save area, subnormal support in the binary operators default to auto, this means it would be off unless there is direct hardened DSP support for it. Users can turn it on when the computation is expected to produce values close to 0, with some additional area. - -The general form of explicit operations are as follows: +The general form of explicit operations provided in the `ap_float` math functions are as follows: -For addition, subtraction and division: -Syntax: +For addition, subtraction and division, the syntax is: ``` - ap_float::add/sub/div(op1, op2) + ihc::ap_float::add/sub/div(op1, op2) ``` Usage: @@ -152,11 +133,10 @@ Usage: * Implements the operation with the provided accuracy and subnormal options. * Returns the result with type `ap_float` -For multiplication: -Syntax: +For multiplication, the syntax is: ``` - ap_float::mul(op1, op2) + ihc::ap_float::mul(op1, op2) ``` Usage: @@ -180,22 +160,41 @@ The subnormal setting is optional can be one of the `enum`s below: Note: * Both `enum`s need to be compile time constants. * You must specify the accuracy setting if you want to specify the subnormal setting. + +The kernel code in the function `TestSpecializedQuadraticEqnSolver()` demonstrates how to use the explicit versions of `ap_float` binary operators to perform floating-point arithmetic operations based on your need. + +You can fine-tune the floating-point arithmetic operations when you are multiplying numbers with different precisions and/or outputting the +result of the multiply with a different precision. + +You may also want to fine-tune arithmetic operations when you are not concerned about the accuracy of the operation, or when you expect your values to easily fall into the subnormal range and you do not wish to flush them to zero when that happens. + +To address these use cases, `ap_float` provides an explicit version of binary operators using template functions. The explicit operators provide 3 more main features in addition to basic binary operators. +1. Allow inputs and outputs with different precisions in the multiplication. + +2. Tweak the area and accuracy trade off of the binary operations. + + The binary operations have high accuracy by default and produce results that are 0.5 ULP off from the most correct result. Users can override the default to choose an implementation with less area but also less precision (1 ULP). + +3. Turn on/off subnormal support in the binary operations. + + To save area, subnormal support in the binary operators default to auto, this means it would be off unless there is direct hardened DSP support for it. Users can turn it on when the computation is expected to produce values close to 0, with some additional area. + After fine-tuning the operations, the overall structure of the area report would remain the same, but for each of the fine-tuned operation, you should see an area reduction on the same line if you have chosen to use the low accuracy variant of the operation, or an area increase if you decide to enable subnormal support on an operation. 
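As an illustration, the sketch below shows one way these explicit functions might be invoked. The variable names are hypothetical, and the exact template signatures should be checked against the `TestSpecializedQuadraticEqnSolver()` code in this tutorial and the Optimization Guide.

```cpp
// Sketch only: assumes the ap_float headers and the -qactypes flag described earlier.
using APFloatType  = ihc::ap_float<8, 23>;   // float-like operands
using APDoubleType = ihc::ap_float<11, 52>;  // double-like results

constexpr auto kAccuracyHigh = ihc::fp_config::FP_Accuracy::HIGH;
constexpr auto kAccuracyLow  = ihc::fp_config::FP_Accuracy::LOW;
constexpr auto kSubnormalOn  = ihc::fp_config::FP_Subnormal::ON;

APFloatType a(1.5f), b(2.5f);

// Multiply two float-like operands and produce a double-like product directly,
// avoiding a separate up-cast of each operand.
auto prod = APDoubleType::mul<APFloatType, APFloatType>(a, b);

// Subtract at the default (high) accuracy, but with subnormal support enabled.
auto diff = APDoubleType::sub<kAccuracyHigh, kSubnormalOn>(prod, APDoubleType(1.0));

// Divide with the cheaper, low-accuracy (1 ULP) implementation.
auto quot = APDoubleType::div<kAccuracyLow>(diff, APDoubleType(3.0));
```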
### Code Example -This section of the tutorial corresponds to the kernels `SimpleQuadraticEqnSolverKernel` and `SpecializedQuadraticEqnSolverKernel`. +This section of the tutorial corresponds to the functions `TestSimpleQuadraticEqnSolver` and `TestSpecializedQuadraticEqnSolver`, which contain the kernels `SimpleQuadraticEqnSolverKernel` and `SpecializedQuadraticEqnSolverKernel` respectively. `SimpleQuadraticEqnSolverKernel` demonstrates a design that uses `ap_float` with arithmetic operators to compute the quadratic formula. After you have successfully compiled the design, open -"part1_operator.prj/reports/report.html " and open the Area Analysis page. +"ap_float_report.prj/reports/report.html " and open the Area Analysis page. Make sure you understand the resource utilization from each operation as we will compare the data to the second part of the tutorial. `SpecializedQuadraticEqnSolverKernel` implements the same design but with the explicit `ap_float` math functions instead of the binary operators. Please refer to the comments in the code to understand how each operation has been tweaked. -See the section `Examining the reports for the Quadratic Equation Solver Kernels` below to know more about what to look for in the reports. +See the section *Examining the reports for the Quadratic Equation Solver Kernels* below to know more about what to look for in the reports. ## Key Concepts * `ap_float` can be used to improve the quality of results on the FPGA by leveraging various features like arbitrary precision, rounding modes, and explicit math functions. @@ -329,11 +328,6 @@ operations use lesser resources for the `ApproximateSineWithAPFloat` kernel. You should observe an area reduction in resource utilization of up to 30% for the binary operations. -TODO: Is simulation supported for customers yet? -Open the "Throughput Analysis > Verification Statistics" page. You should observe around 15% percent latency reduction when using the `ap_float` type. - -You should also note that simulations indicate that the values computed for both designs are the same. This indicates that our original precision expectation is still satisfied. - ### Examining the Reports for Conversion Kernels You can find the usages of conversion in both the area report and the graph viewer. The name of the rounding block is "cast". @@ -383,7 +377,7 @@ Let's look at the reports and analyze each kernel in the tutorial. ### Examining the Reports for the Quadratic Equation Solver Kernels -Navigate to the "Area Analysis of System" report under the "Area Analysis" tab and expand the "Kernel System" section. Observe the differences in area utilization for the two kernels `SimpleQuadraticEqnSolverKernel` and `SpecializedQuadraticEqnSolverKernel`. You should observe a decrease in area of the multiplier in the calculation of `b*b - 4*a*c` at their corresponding line numbers. +Navigate to the "Area Analysis of System" report under the "Area Analysis" tab and expand the *Kernel System* section. Observe the differences in area utilization for the two kernels `SimpleQuadraticEqnSolverKernel` and `SpecializedQuadraticEqnSolverKernel`. You should observe a decrease in area of the multiplier in the calculation of `b*b - 4*a*c` at their corresponding line numbers. You should also observe a significant area estimation reduction of the divider from changing it to the low accuracy mode in the report. Also note that the area increase of the subtraction as we enable the subnormal support. 
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp index 03fbb33020..4559b33919 100644 --- a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp @@ -30,7 +30,9 @@ class ConversionKernelC; class SimpleQuadraticEqnSolverKernel; class SpecializedQuadraticEqnSolverKernel; +// The number of terms in the polynomial approximation of the sine function constexpr int kSineApproximateTermsCount = 10; + constexpr double kSineApproximationEpsilon = 1e-13; // ap_float< 8,23> has the same number of exponent and mantissa bits as native @@ -58,8 +60,10 @@ constexpr auto kRoundingModeRNE = ihc::fp_config::FP_Round::RNE; // Polynomial Sine Approximation example // -------------------------------------------------------------------------- // +// The function template to generate sine-approximation kernels with different +// floating data types template -void RunSineApproximationKernel(queue &q, const T &input, T &output) { +void SineApproximationKernel(queue &q, const T &input, T &output) { buffer inp_buffer(&input, 1); buffer res_buffer(&output, 1); @@ -87,7 +91,7 @@ void RunSineApproximationKernel(queue &q, const T &input, T &output) { }); } -void TestSineApproximation(queue &q) { +bool TestSineApproximation(queue &q) { bool passed_native = false, passed_non_native = false; std::cout << "Testing basic arithmetic operators to approximate the sine " @@ -99,8 +103,8 @@ void TestSineApproximation(queue &q) { double double_result; // Approximate with native double type - RunSineApproximationKernel(q, input, - double_result); + SineApproximationKernel(q, input, + double_result); // Approximate with ap_float type // We set the rounding mode to RZERO (truncate to zero) because this allows us @@ -111,7 +115,7 @@ void TestSineApproximation(queue &q) { APDoubleTypeC ap_float_input = (APDoubleTypeC)input; APDoubleTypeC ap_float_result; - RunSineApproximationKernel( + SineApproximationKernel( q, ap_float_input, ap_float_result); double difference_a = std::abs(double_result - expected); @@ -136,10 +140,13 @@ void TestSineApproximation(queue &q) { passed_native = (difference_a < kSineApproximationEpsilon); passed_non_native = (difference_b < kSineApproximationEpsilon); + std::cout << "\nSine Approximation: "; if (passed_native && passed_non_native) { - std::cout << "\nPASSED\n\n"; + std::cout << "PASSED\n\n"; + return true; } else { - std::cout << "\nFAILED\n\n"; + std::cout << "FAILED\n\n"; + return false; } } @@ -160,8 +167,8 @@ void TestConversionKernelA(queue &q, const APFloatType &num, accessor res_accessor{res_buffer, h, write_only, no_init}; h.single_task([=] { - // This is a direct bitcast: x and y will be compile time constants and - // hence no cast operation will be generated for it. + // x and y will be compile time constants and hence no cast operation will + // be generated for it. 
const APFloatType x = 3.1f; const APDoubleType y = 4.1; @@ -279,7 +286,7 @@ bool RunSpecifiedConversionKernel(queue &q, return difference < kConversionKernelEpsilon; } -void TestAllConversionKernels(queue &q) { +bool TestAllConversionKernels(queue &q) { std::cout << "Testing conversions in ap_float\n"; bool passed_A = RunSpecifiedConversionKernel( q, TestConversionKernelA); @@ -293,10 +300,13 @@ void TestAllConversionKernels(queue &q) { bool passed_C = RunSpecifiedConversionKernel( q, TestConversionKernelC); + std::cout << "Conversion: "; if (passed_A && passed_B && passed_C) { std::cout << "PASSED\n\n"; + return true; } else { std::cout << "FAILED\n\n"; + return false; } } @@ -421,10 +431,10 @@ void TestSpecializedQuadraticEqnSolver(queue &q, const float A, const float B, r = std::make_pair(root1, root2); } -bool TestQuadraticEqnSolverKernels(queue &q, - void (*func)(queue &, const float, - const float, const float, - PairAPDoubleType &)) { +bool RunSpecifiedQuadraticEqnSolverKernel(queue &q, + void (*func)(queue &, const float, + const float, const float, + PairAPDoubleType &)) { constexpr double kQuadraticEqnEpsilon = 1e-6; constexpr size_t kQuadraticTestsCount = 3; @@ -480,17 +490,21 @@ bool TestQuadraticEqnSolverKernels(queue &q, return passed; } -void TestQuadraticEquationSolverKernels(queue &q) { +bool TestQuadraticEquationSolverKernels(queue &q) { std::cout << "Calculating quadratic equation in higher precision\n"; - auto test_a = TestQuadraticEqnSolverKernels(q, TestSimpleQuadraticEqnSolver); + auto test_a = + RunSpecifiedQuadraticEqnSolverKernel(q, TestSimpleQuadraticEqnSolver); std::cout << "\nCalculating quadratic equation with the optimized kernel\n"; - auto test_b = - TestQuadraticEqnSolverKernels(q, TestSpecializedQuadraticEqnSolver); + auto test_b = RunSpecifiedQuadraticEqnSolverKernel( + q, TestSpecializedQuadraticEqnSolver); + std::cout << "\nQuadratic Equation Solving: "; if (test_a && test_b) { - std::cout << "\nPASSED\n"; + std::cout << "PASSED\n"; + return true; } else { - std::cout << "\nFAILED\n"; + std::cout << "FAILED\n"; + return false; } } @@ -501,13 +515,15 @@ int main() { ext::intel::fpga_selector selector; #endif + bool passed = true; + try { // Create the SYCL device queue queue q(selector, dpc_common::exception_handler); - TestSineApproximation(q); - TestAllConversionKernels(q); - TestQuadraticEquationSolverKernels(q); + passed &= TestSineApproximation(q); + passed &= TestAllConversionKernels(q); + passed &= TestQuadraticEquationSolverKernels(q); } catch (sycl::exception const &e) { // Catches exceptions in the host code @@ -524,5 +540,11 @@ int main() { std::terminate(); } + if (passed) { + std::cout << "\nPASSED: all kernel results are correct.\n\n"; + } else { + std::cout << "\nFAILED\n\n"; + } + return 0; } From 4ff861b4ec4d8fe8963b0148bd9b40e80a084645 Mon Sep 17 00:00:00 2001 From: "Wang, Di5" Date: Wed, 13 Apr 2022 10:59:46 -0700 Subject: [PATCH 4/7] small changes on README --- .../Tutorials/Features/ap_float/README.md | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md index 1db13f80da..984d23016c 100755 --- a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md @@ -31,9 +31,11 @@ This tutorial will present the following: An `ap_float` number can be defined as follows: ```cpp -ihc::ap_float a; 
+ihc::ap_float a; ``` -Here `EW` specifies the exponent width and `MW` specifies the mantissa width of the number. Optionally, another template parameter can be specified to set the rounding mode. For more details please refer to the section [*Declare the ap_float Data Type*](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide/top/optimize-your-design/resource-use/data-types-and-operations/var-prec-fp-sup/declare-and-use-the-ac-data-types/declare-the-ap-float-data-type.html) in the Intel® oneAPI DPC++ FPGA Optimization Guide. +which consists of `E+M+1` bits: one sign bit, `E` exponent bits and `M` mantissa bits. For example, `ap_float<8,23>` has the same number of exponent and mantissa bits as native `float`, and `ap_float<11,52>` has the same number of exponent and mantissa bits as native `double`. + +Optionally, another template parameter can be specified to set the rounding mode. For more details please refer to the section [*Declare the ap_float Data Type*](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide/top/optimize-your-design/resource-use/data-types-and-operations/var-prec-fp-sup/declare-and-use-the-ac-data-types/declare-the-ap-float-data-type.html) in the Intel® oneAPI DPC++ FPGA Optimization Guide. To use this type in your code, you must include the following header: @@ -102,7 +104,7 @@ It is important to understand when the intermediate conversions can occur. Conve There are a few ways to generate compile-time `ap_float` constants that do not require any hardware implementation: - 1. Initializing `ap_float<8,23>` from `float` or `ap_float<11,52>` from `double` is just a direct bitwise copy (wires in RTL), so if the input `float`/`double` is a compile-time constant, the constructed `ap_float` is also a compile-time constant. You may want to extend these two types instead of the native `float` and `double` type if you want to use `ap_float` specific floating-point arithmetic controls (for example, the explicit binary operation presented in the section titled *ap_float_explicit_arithmetic*). + 1. Initializing `ap_float<8,23>` from `float` or `ap_float<11,52>` from `double` is just a direct bitwise copy (wires in RTL), so if the input `float`/`double` is a compile-time constant, the constructed `ap_float` is also a compile-time constant. You may want to extend these two types instead of the native `float` and `double` type if you want to use `ap_float` specific floating-point arithmetic controls (for example, the explicit binary operation presented in the next section *Using Explicit `ap_float` Math Functions in Place of Mathematical Operators*). 2. Converting from a constant to another `ap_float` that has rounding mode `FP_Round::ZERO` also results in a compile time constant. This rounding mode is also respected in a binary operation when promotion rounding is required. This is demonstrated by the kernel code in the function `TestConversionKernelB()`. @@ -186,15 +188,10 @@ After fine-tuning the operations, the overall structure of the area report would This section of the tutorial corresponds to the functions `TestSimpleQuadraticEqnSolver` and `TestSpecializedQuadraticEqnSolver`, which contain the kernels `SimpleQuadraticEqnSolverKernel` and `SpecializedQuadraticEqnSolverKernel` respectively. `SimpleQuadraticEqnSolverKernel` demonstrates a design that uses `ap_float` with arithmetic operators to compute the quadratic formula. 
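As a point of reference, the following is a minimal sketch (not the tutorial source) of what the operator-based computation inside such a kernel can look like; every intermediate value is held in the double-like `APDoubleType`, so full-precision operators are instantiated throughout:

```cpp
// Sketch only: x, y, z are read-only accessors and r1, r2 are write-only
// accessors, as in the tutorial's kernel code.
APDoubleType a(x[0]), b(y[0]), c(z[0]);

// Overloaded operators compute every intermediate at the full <11,52> precision.
APDoubleType rooted = b * b - APDoubleType(4.0) * a * c;

if (rooted > 0.0) {
  APDoubleType root = ihc::ihc_sqrt(rooted);
  r1[0] = (-b + root) / (APDoubleType(2.0) * a);
  r2[0] = (-b - root) / (APDoubleType(2.0) * a);
} else {
  r1[0] = APDoubleType::nan();
  r2[0] = APDoubleType::nan();
}
```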
- -After you have successfully compiled the design, open -"ap_float_report.prj/reports/report.html " and open the Area Analysis page. -Make sure you understand the resource utilization from each operation as we -will compare the data to the second part of the tutorial. `SpecializedQuadraticEqnSolverKernel` implements the same design but with the explicit `ap_float` math functions instead of the binary operators. Please refer to the comments in the code to understand how each operation has been tweaked. -See the section *Examining the reports for the Quadratic Equation Solver Kernels* below to know more about what to look for in the reports. +To compare the resource utilization between arithmetic operators and explicit math functions, see the section *Examining the reports for the Quadratic Equation Solver Kernels* below to know more about what to look for in the reports. ## Key Concepts * `ap_float` can be used to improve the quality of results on the FPGA by leveraging various features like arbitrary precision, rounding modes, and explicit math functions. @@ -319,7 +316,7 @@ Locate the pair of `report.html` files in either: ### Examining the Area Reports for the Sine Approximation Kernels -Navigate to the "Area Analysis-> Area Analysis of System" page. Click on the `Kernel System` line to expand it. +Navigate to the *Area Estimates* page. Click on the *Kernel System* line to expand it. Observe the difference in resource utilization of the kernels `ApproximateSineWithDouble` and `ApproximateSineWithAPFloat`. @@ -361,10 +358,8 @@ Let's look at the reports and analyze each kernel in the tutorial. The other cast node is represented by a combination of `shift`, `select`, and `and` operations hence only one node labeled as "cast" is visible in the reports. - The reduction in the number of cast nodes as compared to `ConversionKernelA` results in reduction of hardware resources used by `ConversionKernelB`. + Similarly, the reduction in the number of cast nodes as compared to `ConversionKernelA` results in reduction of hardware resources used by `ConversionKernelC`. - Observe the differences in the resource usage of these two kernels by navigating to the *Area Analysis of System* report (*Area Analysis* > *Area Analysis of System*) and looking at the entries under `Kernel System`. - 2. Kernel: `ConversionKernelC` This kernel shows how to use the `convert_to` function and modify the rounding mode for a specific operation. @@ -375,9 +370,10 @@ Let's look at the reports and analyze each kernel in the tutorial. z.convert_to<11, 52, RndZ>(); ``` + ### Examining the Reports for the Quadratic Equation Solver Kernels -Navigate to the "Area Analysis of System" report under the "Area Analysis" tab and expand the *Kernel System* section. Observe the differences in area utilization for the two kernels `SimpleQuadraticEqnSolverKernel` and `SpecializedQuadraticEqnSolverKernel`. You should observe a decrease in area of the multiplier in the calculation of `b*b - 4*a*c` at their corresponding line numbers. +Navigate to the *Area Estimates* report and expand the *Kernel System* section. Observe the differences in area utilization for the two kernels `SimpleQuadraticEqnSolverKernel` and `SpecializedQuadraticEqnSolverKernel`. You should observe a decrease in area of the multiplier in the calculation of `b*b - 4*a*c` at their corresponding line numbers. You should also observe a significant area estimation reduction of the divider from changing it to the low accuracy mode in the report. 
Also note that the area increase of the subtraction as we enable the subnormal support. From 72387743983afdad1e0df7ec71500ca18d7eb473 Mon Sep 17 00:00:00 2001 From: "Wang, Di5" Date: Wed, 13 Apr 2022 12:05:46 -0700 Subject: [PATCH 5/7] remove ApproximateSine sample --- .../Tutorials/Features/ap_float/README.md | 92 ++++------------ .../Features/ap_float/ap_float.vcxproj | 12 +- .../Features/ap_float/src/ap_float.cpp | 103 ------------------ 3 files changed, 24 insertions(+), 183 deletions(-) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md index 984d23016c..0390b56d78 100755 --- a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md @@ -22,9 +22,8 @@ This data-type can be used in place of native floating point types to generate a This tutorial will present the following: 1. How to include the `ap_float` type and an overview of common `ap_float` use cases. -2. A Polynomial Sine Approximation example which illustrates how to trade off mathematical accuracy for lesser FPGA resource utilization. -3. Rounding Mode and native type to `ap_float` type conversion examples which describe various `ap_float` rounding modes and their effect on accuracy and FPGA resource utilization. -4. A Quadratic Equation Solver example which showcases explicit `ap_float` math functions and how they can be used to replace mathematical operators like `*, /, +` and `-` for better quality of results. +2. Rounding Mode and native type to `ap_float` type conversion examples which describe various `ap_float` rounding modes and their effect on accuracy and FPGA resource utilization. +3. A Quadratic Equation Solver example which showcases explicit `ap_float` math functions and how they can be used to replace mathematical operators like `*, /, +` and `-` for better quality of results. ## Simple Code Example @@ -53,52 +52,21 @@ Additionally, you must pass the flag `-qactypes` (Linux) / `/Qactypes` (Windows) You can easily convert your existing designs that use native floating-point types to use `ap_float`: simply switch the original type. For math functions, `ap_float` has the "ihc_" prefix, you can simply switch your math functions accordingly, e.g. `sin(x)` should be changed to `ihc_sin(x)` for `ap_float`. -After the migration, you can use the area report to examine the area improvement of your design. In general, the line structure of the area report does not change. For example, instead of seeing a `X bit floating-point multiply` on the old design, the source line for the changed design would show `fpga.vpfp.mul`. - -You should confirm that the area used for the operation has indeed decreased from a Quartus compile. - ## Overview of Common Use Cases for `ap_float` You should consider migrating to `ap_float` types when you have precision requirements that differ from native `float` and `double` types, including both the range (number of exponent bits) and precision (number of mantissa bits) metrics. -Double precision operations cannot be placed into a single hardened DSP block like single-precision operations, so double precision operations are significantly more area intensive and use more hardware resources. Moreover, `float` only has 23 bits of mantissa while `double` has 52, this could be an overkill for applications that only seek a sweet spot in between. 
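For instance, a design that needs more mantissa bits than `float` offers but does not need all 52 bits of `double` could use an intermediate width, such as the `<11, 44>` configuration used elsewhere in this tutorial:

```cpp
// An in-between precision: double-like 11-bit exponent range with a 44-bit
// mantissa -- wider than float's 23 bits, cheaper than double's 52.
using MidPrecisionType = ihc::ap_float<11, 44>;
```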
- Additionally, the built in subnormal support with native `double` type is area intensive and being able to turn subnormal support off can be great for reducing area utilization if the application does not consider very small subnormal numbers. Finally, the various rounding modes offered along with the `ap_float` type can help trade-off mathematical accuracy for FPGA resource utilization. -## Trading Off Mathematical Accuracy for Better Resource Utilization - -Two kernels `ApproximateSineWithDouble` and `ApproximateSineWithAPFloat`, instantiated from the template function `RunSineApproximationKernel()`, implement a simple polynomial approximation of the sine function with single and double precision respectively. - -The former uses `double` type to do so and the latter uses an `ap_float<11,44, Rnd>`. The `Rnd` rounding mode rounds towards zero. These two kernels will illustrate how to trade off accuracy for lesser FPGA resource utilization. - -See the section *Examining the Reports* to go over the differences in resource utilization between these kernels. See the section *Example of Output* to see the difference in accuracy of results produced by these kernels. - -Note how the kernel function within `RunSineApproximationKernel()` has been written once and the individual kernels are only differentiated by their input/output data types: `ApproximateSineWithDouble` uses `double` data type and `ApproximateSineWithAPFLoat` uses `ap_float` data type. - -```cpp -// Approximate sine with native double type -RunSineApproximationKernel(q, input, - double_result); -... -constexpr auto Rnd = ihc::fp_config::FP_Round::RZERO; -using ap_float_double = ihc::ap_float<11, 44, Rnd>; - -// Approximate sine with `ap_float` type -RunSineApproximationKernel( - q, ap_float_input, ap_float_result); -``` - -This code-reuse is because `ap_float` is designed to fully blend in with native C++ types for syntax and semantics. - ## Conversion Between Native Types and `ap_float` In normal floating-point FPGA applications, floating-point literals are represented as compile-time constants and implemented as tie-offs (wires that directly connects to Gnd/Vcc) in RTL. This allows the construction of a constant to use no hardware resources in the FPGA flow. However, `ap_float` types that have non-standard exponent and mantissa widths cannot be trivially converted from C++ native `float` or `double` literals. As a result, the construction of an `ap_float` type may sometimes require FPGA logic resources to round the native floating-point constant to the specified `ap_float`. This is called 'intermediate conversion'. -It is important to understand when the intermediate conversions can occur. Conversion does not only happen when you are explicitly casting numbers: it can also happen when you perform arithmetic operations using `ap_float` types with different precisions. Intermediate conversions are necessary because the operation needs to unify the types of the operands by promoting the less "dominant" types (types that have lower representable range). This is demonstrated by the kernel code in the function `TestConversionKernelA`. +It is important to understand when the intermediate conversions can occur. Conversion does not only happen when you are explicitly casting numbers: it can also happen when you perform arithmetic operations using `ap_float` types with different precisions. 
Intermediate conversions are necessary because the operation needs to unify the types of the operands by promoting the less "dominant" types (types that have lower representable range). This is demonstrated by the kernel `ConversionKernelA` in the function `TestConversionKernelA`. ### Converting Native Numbers to `ap_float` Numbers with Minimal FPGA Hardware Resources @@ -106,9 +74,9 @@ There are a few ways to generate compile-time `ap_float` constants that do not r 1. Initializing `ap_float<8,23>` from `float` or `ap_float<11,52>` from `double` is just a direct bitwise copy (wires in RTL), so if the input `float`/`double` is a compile-time constant, the constructed `ap_float` is also a compile-time constant. You may want to extend these two types instead of the native `float` and `double` type if you want to use `ap_float` specific floating-point arithmetic controls (for example, the explicit binary operation presented in the next section *Using Explicit `ap_float` Math Functions in Place of Mathematical Operators*). - 2. Converting from a constant to another `ap_float` that has rounding mode `FP_Round::ZERO` also results in a compile time constant. This rounding mode is also respected in a binary operation when promotion rounding is required. This is demonstrated by the kernel code in the function `TestConversionKernelB()`. + 2. Converting from a constant to another `ap_float` that has rounding mode `FP_Round::ZERO` also results in a compile time constant. This rounding mode is also respected in a binary operation when promotion rounding is required. This is demonstrated by the kernel `ConversionKernelB` in the function `TestConversionKernelB()`. - 3. The `convert_to` method of an `ap_float` returns itself rounded to a different type, it accepts a rounding mode as either accurate and area-intensive `RNE` mode (rounds to nearest, tie breaks to even) or inaccurate and non area-intensive `RZERO` (truncate towards zero) mode. When using `RZERO`, the compiler will also be able to convert a constant at compile time. This conversion bypasses the original rounding mode of the `ap_float` type. It is demonstrated by the code in the function `TestConversionKernelC`. + 3. The `convert_to` method of an `ap_float` returns itself rounded to a different type, it accepts a rounding mode as either accurate and area-intensive `RNE` mode (rounds to nearest, tie breaks to even) or inaccurate and non area-intensive `RZERO` (truncate towards zero) mode. When using `RZERO`, the compiler will also be able to convert a constant at compile time. This conversion bypasses the original rounding mode of the `ap_float` type. It is demonstrated by the kernel `ConversionKernelC` in the function `TestConversionKernelC`. The kernel code in this tutorial contains comments that describe which operations result in generation of explicit cast operations and which do not. @@ -195,7 +163,6 @@ To compare the resource utilization between arithmetic operators and explicit ma ## Key Concepts * `ap_float` can be used to improve the quality of results on the FPGA by leveraging various features like arbitrary precision, rounding modes, and explicit math functions. -* Use `ap_float` to reduce the range or precision of the operation as required as opposed to native floating point types which have fixed range and precision. * Rounding mode `RZERO` produces simpler hardware at the cost of accuracy whereas the default rounding mode `RNE` produces more accurate results and uses more FPGA resources. 
* The explicit math functions provided by `ap_float` can be used in place of binary math operators such as `+, -, *` and `/`. The functions provide template parameters for fine tuning the accuracy of the operation and turning subnormal number support on or off. @@ -314,20 +281,9 @@ Locate the pair of `report.html` files in either: * **Report-only compile**: `ap_float_report.prj` * **FPGA hardware compile**: `ap_float.prj` -### Examining the Area Reports for the Sine Approximation Kernels - -Navigate to the *Area Estimates* page. Click on the *Kernel System* line to expand it. - -Observe the difference in resource utilization of the kernels `ApproximateSineWithDouble` and `ApproximateSineWithAPFloat`. - -Expand the lines with the kernel names by clicking on them and expand the sub hierarchies to observe how the `add, mult` and `div` -operations use lesser resources for the `ApproximateSineWithAPFloat` kernel. - -You should observe an area reduction in resource utilization of up to 30% for the binary operations. - ### Examining the Reports for Conversion Kernels -You can find the usages of conversion in both the area report and the graph viewer. The name of the rounding block is "cast". +You can find the usages of conversion in both the *Area Estimates* and the *System Viewer* report . The name of the rounding block is "cast". Let's look at the reports and analyze each kernel in the tutorial. 1. Kernel: `ConversionKernelA` @@ -350,17 +306,19 @@ Let's look at the reports and analyze each kernel in the tutorial. 2. Kernel: `ConversionKernelB` This kernel uses the simpler rounding mode `RZERO`. - In the graph for the cluster under `ConversionKernelB`, you will find that it now only contains one "cast" node. This corresponds to the code: + In the graph for the cluster under `ConversionKernelB`, you will find that it now only contains one explicit "cast" node. This corresponds to the code: ```cpp x * num_accessor[0] + ... ``` Although `x` and `num_accessor[0]` represent `ap_float`s constructed to use rounding mode `RZERO`, the result of this operation is cast to the higher precision `ap_float` using the default rounding mode `RNE` as the multiplication result is an operand for the next operation which uses higher precision. - The other cast node is represented by a combination of `shift`, `select`, and `and` operations hence only one node labeled as "cast" is visible in the reports. - - Similarly, the reduction in the number of cast nodes as compared to `ConversionKernelA` results in reduction of hardware resources used by `ConversionKernelC`. - -2. Kernel: `ConversionKernelC` + The other cast node is implicit and represented by a combination of `shift`, `select`, and `and` operations. + + The reduction in the number of cast nodes as compared to `ConversionKernelA` results in a reduction of hardware resources used by `ConversionKernelB`. + + Observe the differences in the resource usage of these two kernels by navigating to the *Area Estimates* report and looking at the entries under *Kernel System*. + +3. Kernel: `ConversionKernelC` This kernel shows how to use the `convert_to` function and modify the rounding mode for a specific operation. In the graph for the cluster under `Kernel_C`, you will find that it contains two "cast" nodes, corresponding to the conversions: @@ -370,6 +328,8 @@ Let's look at the reports and analyze each kernel in the tutorial. 
z.convert_to<11, 52, RndZ>(); ``` + Similarly, the reduction in the number of cast nodes as compared to `ConversionKernelA` results in reduction of hardware resources used by `ConversionKernelC`. + ### Examining the Reports for the Quadratic Equation Solver Kernels @@ -395,20 +355,6 @@ You should also observe a significant area estimation reduction of the divider f ### Example of Output ```txt -Testing basic arithmetic operators to approximate the sine function - -Native Type Result: -Result = 0.707 -Expected = 0.707 -Difference = 1.11e-16 - -Non Native Type Result: -Result = 0.707 -Expected = 0.707 -Difference = 5.12e-14 - -PASSED - Testing conversions in ap_float Result = 76.8 Expected = 76.8 @@ -424,7 +370,7 @@ Result = 76.8 Expected = 76.8 Difference = 1.81e-06 -PASSED +Conversion: PASSED Calculating quadratic equation in higher precision Result = 3.26 and 1.84 @@ -458,7 +404,9 @@ Difference = 0 and 1.49e-09 Result = NaN and NaN Expected = NaN and NaN -PASSED +Quadratic Equation Solving: PASSED + +PASSED: all kernel results are correct. ``` ### Discussion of Results diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/ap_float.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/ap_float.vcxproj index 44440f48fd..e1ed886e0e 100755 --- a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/ap_float.vcxproj +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/ap_float.vcxproj @@ -27,26 +27,26 @@ Application true - Intel(R) oneAPI DPC++ Compiler + Intel(R) oneAPI DPC++ Compiler 2022 Unicode Application false - Intel(R) oneAPI DPC++ Compiler + Intel(R) oneAPI DPC++ Compiler 2022 true Unicode Application true - Intel(R) oneAPI DPC++ Compiler + Intel(R) oneAPI DPC++ Compiler 2022 Unicode Application false - Intel(R) oneAPI DPC++ Compiler + Intel(R) oneAPI DPC++ Compiler 2022 true Unicode @@ -93,7 +93,6 @@ Console true - /Qactypes @@ -112,7 +111,6 @@ Console true - /Qactypes @@ -132,7 +130,6 @@ true true true - /Qactypes @@ -155,7 +152,6 @@ true true true - /Qactypes diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp index 4559b33919..7513f56ffb 100644 --- a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp @@ -20,9 +20,6 @@ using namespace sycl; // Forward declare the kernel name in the global scope. // This is a FPGA best practice that reduces name mangling in the optimization // reports. 
-class ApproximateSineWithDouble; -class ApproximateSineWithAPFloat; - class ConversionKernelA; class ConversionKernelB; class ConversionKernelC; @@ -30,11 +27,6 @@ class ConversionKernelC; class SimpleQuadraticEqnSolverKernel; class SpecializedQuadraticEqnSolverKernel; -// The number of terms in the polynomial approximation of the sine function -constexpr int kSineApproximateTermsCount = 10; - -constexpr double kSineApproximationEpsilon = 1e-13; - // ap_float< 8,23> has the same number of exponent and mantissa bits as native // float type using APFloatType = ihc::ap_float<8, 23>; @@ -56,100 +48,6 @@ using APDoubleTypeB = ihc::ap_float<11, 52, kRoundingModeRZERO>; constexpr auto kRoundingModeRNE = ihc::fp_config::FP_Round::RNE; -// -------------------------------------------------------------------------- // -// Polynomial Sine Approximation example -// -------------------------------------------------------------------------- // - -// The function template to generate sine-approximation kernels with different -// floating data types -template -void SineApproximationKernel(queue &q, const T &input, T &output) { - buffer inp_buffer(&input, 1); - buffer res_buffer(&output, 1); - - q.submit([&](handler &h) { - accessor x{inp_buffer, h, read_only}; - accessor retval{res_buffer, h, write_only, no_init}; - - h.single_task([=] { - T res = 0.0; - T sign = 1.0; - T term = x[0]; - T numerator = x[0]; - T denom = 1.0; - -#pragma unroll - for (int i = 1; i <= kSineApproximateTermsCount; ++i) { - res += term; - sign = -sign; - denom *= 2 * i * (2 * i + 1); - numerator *= x[0] * x[0]; - term = sign * numerator / denom; - } - retval[0] = res; - }); - }); -} - -bool TestSineApproximation(queue &q) { - bool passed_native = false, passed_non_native = false; - - std::cout << "Testing basic arithmetic operators to approximate the sine " - "function\n\n"; - - double input = M_PI_4; // pi / 4 - double expected = - M_SQRT1_2; // 1/square_root(2), it is the value of sin(input); - double double_result; - - // Approximate with native double type - SineApproximationKernel(q, input, - double_result); - - // Approximate with ap_float type - // We set the rounding mode to RZERO (truncate to zero) because this allows us - // to generate compile-time ap_float constants from double type literals shown - // below, which eliminates the area usage for initialization. 
- using APDoubleTypeC = ihc::ap_float<11, 44, kRoundingModeRZERO>; - - APDoubleTypeC ap_float_input = (APDoubleTypeC)input; - APDoubleTypeC ap_float_result; - - SineApproximationKernel( - q, ap_float_input, ap_float_result); - - double difference_a = std::abs(double_result - expected); - double difference_b = std::abs((double)ap_float_result - expected); - - std::cout << "Native Type Result:\n"; - std::cout << "Result = " << std::setprecision(3) << (double)double_result - << "\n"; - std::cout << "Expected = " << std::setprecision(3) << (double)expected - << "\n"; - std::cout << "Difference = " << std::setprecision(3) << (double)difference_a - << "\n\n"; - - std::cout << "Non Native Type Result:\n"; - std::cout << "Result = " << std::setprecision(3) - << (double)ap_float_result << "\n"; - std::cout << "Expected = " << std::setprecision(3) << (double)expected - << "\n"; - std::cout << "Difference = " << std::setprecision(3) << (double)difference_b - << "\n"; - - passed_native = (difference_a < kSineApproximationEpsilon); - passed_non_native = (difference_b < kSineApproximationEpsilon); - - std::cout << "\nSine Approximation: "; - if (passed_native && passed_non_native) { - std::cout << "PASSED\n\n"; - return true; - } else { - std::cout << "FAILED\n\n"; - return false; - } -} - // -------------------------------------------------------------------------- // // Rounding Mode and native type to ap_float type conversion examples // -------------------------------------------------------------------------- // @@ -521,7 +419,6 @@ int main() { // Create the SYCL device queue queue q(selector, dpc_common::exception_handler); - passed &= TestSineApproximation(q); passed &= TestAllConversionKernels(q); passed &= TestQuadraticEquationSolverKernels(q); From dd96c4be512f2a4d984cee90d72f433b75cafcf7 Mon Sep 17 00:00:00 2001 From: "Wang, Di5" Date: Wed, 27 Apr 2022 11:43:25 -0700 Subject: [PATCH 6/7] add ApproximateSine back --- .../Tutorials/Features/ap_float/README.md | 29 +++++ .../Features/ap_float/src/ap_float.cpp | 106 ++++++++++++++++++ 2 files changed, 135 insertions(+) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md index 0390b56d78..07f301e6e1 100755 --- a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md @@ -56,10 +56,39 @@ You can easily convert your existing designs that use native floating-point type You should consider migrating to `ap_float` types when you have precision requirements that differ from native `float` and `double` types, including both the range (number of exponent bits) and precision (number of mantissa bits) metrics. +Starting from oneAPI 2021.2 release, Intel® oneAPI DPC++ Compiler enables fast math by default, which allows relatively aggressive floating point math optimizations for `float` and `double`. These optimizations cause results that don't conform with the ANSI standard (as oneAPI 2021.1 release and GCC do), which trade-off precision for performance and area. + +To achieve double precision that adheres to the ANSI conformance, you must pass the flag `-no-fma -fp-model=precise` (Linux) / `/Qfma- /fp:precise` (Windows) to the `dpcpp` command when compiling your SYCL program. 
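For reference, an illustrative Linux emulator compile might combine these flags with `-qactypes` as shown below; the source path and output name are placeholders rather than the tutorial's actual build commands:

```
# Illustrative command only; adjust flags, paths, and names for your setup.
dpcpp -fintelfpga -qactypes -no-fma -fp-model=precise -DFPGA_EMULATOR src/ap_float.cpp -o ap_float.fpga_emu
```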
However, double precision operations cannot be placed into a single hardened DSP block like single-precision operations, so double precision operations are significantly more area intensive and use more hardware resources. Moreover, `float` only has 23 bits of mantissa while `double` has 52, this could be an overkill for applications that only seek a sweet spot in between. + Additionally, the built in subnormal support with native `double` type is area intensive and being able to turn subnormal support off can be great for reducing area utilization if the application does not consider very small subnormal numbers. Finally, the various rounding modes offered along with the `ap_float` type can help trade-off mathematical accuracy for FPGA resource utilization. +## Trading Off Mathematical Accuracy for Better Resource Utilization + +In this tutorial, the template function `RunSineApproximationKernel()` instantiates two kernels `ApproximateSineWithDouble` and `ApproximateSineWithAPFloat`, which implement a simple polynomial approximation of the sine function with single and double precision respectively. + +The former uses `double` type to do so and the latter uses an `ap_float<11,44, Rnd>`. The `Rnd` rounding mode rounds towards zero. These two kernels will illustrate how to trade off accuracy for lesser FPGA resource utilization. + +See the section *Examining the Reports* to go over the differences in resource utilization between these kernels. See the section *Example of Output* to see the difference in accuracy of results produced by these kernels. + +Note how the kernel function within `RunSineApproximationKernel()` has been written once and the individual kernels are only differentiated by their input/output data types: `ApproximateSineWithDouble` uses `double` data type and `ApproximateSineWithAPFLoat` uses `ap_float` data type. + +```cpp +// Approximate sine with native double type +RunSineApproximationKernel(q, input, + double_result); +... +constexpr auto Rnd = ihc::fp_config::FP_Round::RZERO; +using ap_float_double = ihc::ap_float<11, 44, Rnd>; + +// Approximate sine with `ap_float` type +RunSineApproximationKernel( + q, ap_float_input, ap_float_result); +``` + +This code-reuse is because `ap_float` is designed to fully blend in with native C++ types for syntax and semantics. + ## Conversion Between Native Types and `ap_float` In normal floating-point FPGA applications, floating-point literals are represented as compile-time constants and implemented as tie-offs (wires that directly connects to Gnd/Vcc) in RTL. This allows the construction of a constant to use no hardware resources in the FPGA flow. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp index 7513f56ffb..7712d57112 100644 --- a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp @@ -20,6 +20,9 @@ using namespace sycl; // Forward declare the kernel name in the global scope. // This is a FPGA best practice that reduces name mangling in the optimization // reports. 
+class ApproximateSineWithDouble; +class ApproximateSineWithAPFloat; + class ConversionKernelA; class ConversionKernelB; class ConversionKernelC; @@ -27,6 +30,11 @@ class ConversionKernelC; class SimpleQuadraticEqnSolverKernel; class SpecializedQuadraticEqnSolverKernel; +// The number of terms in the polynomial approximation of the sine function +constexpr int kSineApproximateTermsCount = 10; + +constexpr double kSineApproximationEpsilon = 1e-13; + // ap_float< 8,23> has the same number of exponent and mantissa bits as native // float type using APFloatType = ihc::ap_float<8, 23>; @@ -48,6 +56,103 @@ using APDoubleTypeB = ihc::ap_float<11, 52, kRoundingModeRZERO>; constexpr auto kRoundingModeRNE = ihc::fp_config::FP_Round::RNE; +// -------------------------------------------------------------------------- // +// Polynomial Sine Approximation example +// -------------------------------------------------------------------------- // + +// The function template to generate sine-approximation kernels with different +// floating data types +template +void SineApproximationKernel(queue &q, const T &input, T &output) { + buffer inp_buffer(&input, 1); + buffer res_buffer(&output, 1); + + q.submit([&](handler &h) { + accessor x{inp_buffer, h, read_only}; + accessor retval{res_buffer, h, write_only, no_init}; + + h.single_task([=] { + T res = 0.0; + T sign = 1.0; + T term = x[0]; + T numerator = x[0]; + T denom = 1.0; + +#pragma unroll + for (int i = 1; i <= kSineApproximateTermsCount; ++i) { + res += term; + sign = -sign; + denom *= 2 * i * (2 * i + 1); + numerator *= x[0] * x[0]; + term = sign * numerator / denom; + } + retval[0] = res; + }); + }); +} + +bool TestSineApproximation(queue &q) { + bool passed_native = false; + bool passed_non_native = false; + bool passed_comparison = false; + + std::cout << "Testing basic arithmetic operators to approximate the sine " + "function\n\n"; + + double input = M_PI_4; // pi / 4 + double expected = + M_SQRT1_2; // 1/square_root(2), it is the value of sin(input); + double double_result; + + // Approximate with native double type + SineApproximationKernel(q, input, + double_result); + + // Approximate with ap_float type + // We set the rounding mode to RZERO (truncate to zero) because this allows us + // to generate compile-time ap_float constants from double type literals shown + // below, which eliminates the area usage for initialization. 
+ using APDoubleTypeC = ihc::ap_float<11, 44, kRoundingModeRZERO>; + + APDoubleTypeC ap_float_input = (APDoubleTypeC)input; + APDoubleTypeC ap_float_result; + + SineApproximationKernel( + q, ap_float_input, ap_float_result); + + double difference_a = std::abs(double_result - expected); + double difference_b = std::abs((double)ap_float_result - expected); + + std::cout << "Native Type Result:\n"; + std::cout << "Result = " << std::setprecision(3) << (double)double_result + << "\n"; + std::cout << "Expected = " << std::setprecision(3) << (double)expected + << "\n"; + std::cout << "Difference = " << std::setprecision(3) << (double)difference_a + << "\n\n"; + + std::cout << "Non Native Type Result:\n"; + std::cout << "Result = " << std::setprecision(3) + << (double)ap_float_result << "\n"; + std::cout << "Expected = " << std::setprecision(3) << (double)expected + << "\n"; + std::cout << "Difference = " << std::setprecision(3) << (double)difference_b + << "\n"; + + passed_native = (difference_a < kSineApproximationEpsilon); + passed_non_native = (difference_b < kSineApproximationEpsilon); + passed_comparison = (difference_a < difference_b); + + std::cout << "\nSine Approximation: "; + if (passed_native && passed_comparison && passed_comparison) { + std::cout << "PASSED\n\n"; + return true; + } else { + std::cout << "FAILED\n\n"; + return false; + } +} + // -------------------------------------------------------------------------- // // Rounding Mode and native type to ap_float type conversion examples // -------------------------------------------------------------------------- // @@ -419,6 +524,7 @@ int main() { // Create the SYCL device queue queue q(selector, dpc_common::exception_handler); + passed &= TestSineApproximation(q); passed &= TestAllConversionKernels(q); passed &= TestQuadraticEquationSolverKernels(q); From d63d8772d423f9e1333ca2a4d70e3ef9f9d6f16f Mon Sep 17 00:00:00 2001 From: "Wang, Di5" Date: Thu, 5 May 2022 08:53:10 -0700 Subject: [PATCH 7/7] change the gist of the design ApproximateSine --- .../Tutorials/Features/ap_float/README.md | 68 +++++++++++++------ .../Features/ap_float/src/ap_float.cpp | 60 +++++++--------- 2 files changed, 74 insertions(+), 54 deletions(-) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md index 07f301e6e1..fffad1822e 100755 --- a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/README.md @@ -11,14 +11,14 @@ The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programmi | OS | Linux* Ubuntu* 18.04/20.04, RHEL*/CentOS* 8, SUSE* 15; Windows* 10 | Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA
Intel® FPGA Programmable Acceleration Card (PAC) D5005 (with Intel Stratix® 10 SX)
Intel® FPGA 3rd party / custom platforms with oneAPI support
*__Note__: Intel® FPGA PAC hardware is only compatible with Ubuntu 18.04* | Software | Intel® oneAPI DPC++ Compiler
Intel® FPGA Add-On for oneAPI Base Toolkit -| What you will learn | Including and using the `ap_float` type
Using `ap_float` type to trade off mathematical accuracy for lesser resource utilization
Using various `ap_float` rounding modes and their effect on accuracy and resource utilization
Using the `ap_float `math functions for better quality of results +| What you will learn | Including and using the `ap_float` type
Using various `ap_float` rounding modes and their effect on accuracy and resource utilization
Using the `ap_float` math functions for better quality of results
 | Time to complete | 1 hour
 
 ## Purpose
 
 This FPGA tutorial shows how to use the `ap_float` type with some simple examples and recommended best practices.
 
-This data-type can be used in place of native floating point types to generate area efficient and optimized designs for the FPGA. For example, operations which do not utilize all the bits of the native types or designs which do not require all of the range and precision of native types are good candidates for replacement with the `ap_float` type.
+`ap_float` can be used in place of native floating point types in operations that do not utilize all the bits of the native types, or in designs that do not require the full range and precision of the native types. It can also be used when porting floating-point parameterizations (such as bfloat16) in an existing design to oneAPI implementations.
 
 This tutorial will present the following:
 1. How to include the `ap_float` type and an overview of common `ap_float` use cases.
@@ -34,6 +34,21 @@ ihc::ap_float<E, M> a;
 ```
 which consists of `E+M+1` bits: one sign bit, `E` exponent bits and `M` mantissa bits. For example, `ap_float<8,23>` has the same number of exponent and mantissa bits as native `float`, and `ap_float<11,52>` has the same number of exponent and mantissa bits as native `double`.
 
+Currently the precisions that are supported in `ap_float` include:
+
+```cpp
+ap_float<8, 7>,   // same E and M widths as bfloat
+ap_float<5, 10>,  // same E and M widths as half precision
+ap_float<8, 10>,
+ap_float<8, 17>,
+ap_float<8, 23>,  // same E and M as the native float type
+ap_float<8, 26>,
+ap_float<10, 35>,
+ap_float<11, 44>,
+ap_float<11, 52>, // same E and M as the native double type
+ap_float<15, 63>  // Extended double precision
+```
+
 Optionally, another template parameter can be specified to set the rounding mode. For more details please refer to the section [*Declare the ap_float Data Type*](https://www.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide/top/optimize-your-design/resource-use/data-types-and-operations/var-prec-fp-sup/declare-and-use-the-ac-data-types/declare-the-ap-float-data-type.html) in the Intel® oneAPI DPC++ FPGA Optimization Guide.
 
 To use this type in your code, you must include the following header:
@@ -54,37 +69,34 @@ You can easily convert your existing designs that use native floating-point type
 
 ## Overview of Common Use Cases for `ap_float`
 
-You should consider migrating to `ap_float` types when you have precision requirements that differ from native `float` and `double` types, including both the range (number of exponent bits) and precision (number of mantissa bits) metrics.
-
-Starting from oneAPI 2021.2 release, Intel® oneAPI DPC++ Compiler enables fast math by default, which allows relatively aggressive floating point math optimizations for `float` and `double`. These optimizations cause results that don't conform with the ANSI standard (as oneAPI 2021.1 release and GCC do), which trade-off precision for performance and area.
-
-To achieve double precision that adheres to the ANSI conformance, you must pass the flag `-no-fma -fp-model=precise` (Linux) / `/Qfma- /fp:precise` (Windows) to the `dpcpp` command when compiling your SYCL program. However, double precision operations cannot be placed into a single hardened DSP block like single-precision operations, so double precision operations are significantly more area intensive and use more hardware resources.
Moreover, `float` only has 23 bits of mantissa while `double` has 52, this could be an overkill for applications that only seek a sweet spot in between. +You should consider migrating to `ap_float` types when you have precision requirements that differ from native `float` and `double` types, including both the range (number of exponent bits) and precision (number of mantissa bits) metrics. You can also use `ap_float` to migrate existing designs that contain non-native floating-point types to oneAPI implementations. Additionally, the built in subnormal support with native `double` type is area intensive and being able to turn subnormal support off can be great for reducing area utilization if the application does not consider very small subnormal numbers. Finally, the various rounding modes offered along with the `ap_float` type can help trade-off mathematical accuracy for FPGA resource utilization. -## Trading Off Mathematical Accuracy for Better Resource Utilization +## Using `ap_float` for Non-native Floating Point Parameterization -In this tutorial, the template function `RunSineApproximationKernel()` instantiates two kernels `ApproximateSineWithDouble` and `ApproximateSineWithAPFloat`, which implement a simple polynomial approximation of the sine function with single and double precision respectively. +In this tutorial, the template function `RunSineApproximationKernel()` instantiates two kernels `ApproximateSineWithAPFloat_11_44` and `ApproximateSineWithBFloat16`, which implement a simple polynomial approximation of the sine function with `ap_float<11, 44>` and `ap_float<8, 7>` respectively. Note that the header file `ap_float.hpp` defines the type alias of `ap_float<8, 7>` as -The former uses `double` type to do so and the latter uses an `ap_float<11,44, Rnd>`. The `Rnd` rounding mode rounds towards zero. These two kernels will illustrate how to trade off accuracy for lesser FPGA resource utilization. +```cpp +using bfloat16 = ap_float<8, 7, fp_config::FP_Round::RNE>; +``` See the section *Examining the Reports* to go over the differences in resource utilization between these kernels. See the section *Example of Output* to see the difference in accuracy of results produced by these kernels. -Note how the kernel function within `RunSineApproximationKernel()` has been written once and the individual kernels are only differentiated by their input/output data types: `ApproximateSineWithDouble` uses `double` data type and `ApproximateSineWithAPFLoat` uses `ap_float` data type. +Note how the kernel function within `RunSineApproximationKernel()` has been written once and the individual kernels are only differentiated by their input/output data types: `ApproximateSineWithAPFloat_11_44` uses `ap_float<11, 44>` data type and `ApproximateSineWithBFloat16` uses `bfloat16` data type. ```cpp -// Approximate sine with native double type -RunSineApproximationKernel(q, input, - double_result); +// Approximate sine with `ap_float<11, 44>` type +using APFloat_11_44 = ihc::ap_float<11, 44>; +SineApproximationKernel( + q, (APFloat_11_44)input, ap_float_result); ... 
-constexpr auto Rnd = ihc::fp_config::FP_Round::RZERO; -using ap_float_double = ihc::ap_float<11, 44, Rnd>; -// Approximate sine with `ap_float` type -RunSineApproximationKernel( - q, ap_float_input, ap_float_result); +// Approximate sine with `bfloat16` type +SineApproximationKernel( + q, (ihc::bfloat16)input, bfloat_result); ``` This code-reuse is because `ap_float` is designed to fully blend in with native C++ types for syntax and semantics. @@ -310,6 +322,12 @@ Locate the pair of `report.html` files in either: * **Report-only compile**: `ap_float_report.prj` * **FPGA hardware compile**: `ap_float.prj` +### Examining the Area Reports for the Sine Approximation Kernels + +Navigate to the *Area Estimates* page. Click on the *Kernel System* line to expand it. Observe the difference in resource utilization of the kernels `ApproximateSineWithAPFloat_11_44` and `ApproximateSineWithBFloat16`. + +Expand the lines with the kernel names by clicking on them and expand the sub hierarchies to observe the resource utilization of each arithmetic operation. + ### Examining the Reports for Conversion Kernels You can find the usages of conversion in both the *Area Estimates* and the *System Viewer* report . The name of the rounding block is "cast". @@ -384,6 +402,18 @@ You should also observe a significant area estimation reduction of the divider f ### Example of Output ```txt +ap_float<11,44> Result: +Result = 0.707 +Expected = 0.707 +Difference = 5.12e-14 + +bfloat16 Result: +Result = 0.707 +Expected = 0.707 +Difference = 7.55e-05 + +Sine Approximation: PASSED + Testing conversions in ap_float Result = 76.8 Expected = 76.8 diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp index 7712d57112..ae700899f9 100644 --- a/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ap_float/src/ap_float.cpp @@ -20,8 +20,8 @@ using namespace sycl; // Forward declare the kernel name in the global scope. // This is a FPGA best practice that reduces name mangling in the optimization // reports. 
-class ApproximateSineWithDouble; -class ApproximateSineWithAPFloat; +class ApproximateSineWithAPFloat_11_44; +class ApproximateSineWithBFloat16; class ConversionKernelA; class ConversionKernelB; @@ -33,8 +33,6 @@ class SpecializedQuadraticEqnSolverKernel; // The number of terms in the polynomial approximation of the sine function constexpr int kSineApproximateTermsCount = 10; -constexpr double kSineApproximationEpsilon = 1e-13; - // ap_float< 8,23> has the same number of exponent and mantissa bits as native // float type using APFloatType = ihc::ap_float<8, 23>; @@ -61,7 +59,7 @@ constexpr auto kRoundingModeRNE = ihc::fp_config::FP_Round::RNE; // -------------------------------------------------------------------------- // // The function template to generate sine-approximation kernels with different -// floating data types +// floating-point data types template void SineApproximationKernel(queue &q, const T &input, T &output) { buffer inp_buffer(&input, 1); @@ -101,50 +99,42 @@ bool TestSineApproximation(queue &q) { double input = M_PI_4; // pi / 4 double expected = - M_SQRT1_2; // 1/square_root(2), it is the value of sin(input); - double double_result; - - // Approximate with native double type - SineApproximationKernel(q, input, - double_result); - - // Approximate with ap_float type - // We set the rounding mode to RZERO (truncate to zero) because this allows us - // to generate compile-time ap_float constants from double type literals shown - // below, which eliminates the area usage for initialization. - using APDoubleTypeC = ihc::ap_float<11, 44, kRoundingModeRZERO>; + M_SQRT1_2; // 1/square_root(2), it is the value of sin(pi / 4); - APDoubleTypeC ap_float_input = (APDoubleTypeC)input; - APDoubleTypeC ap_float_result; + // Approximate with the ap_float type with 11 exponent bits and 44 mantissa + // bits + using APFloat_11_44 = ihc::ap_float<11, 44>; + APFloat_11_44 ap_float_result; - SineApproximationKernel( - q, ap_float_input, ap_float_result); + SineApproximationKernel( + q, (APFloat_11_44)input, ap_float_result); + double difference_a = std::abs((double)ap_float_result - expected); - double difference_a = std::abs(double_result - expected); - double difference_b = std::abs((double)ap_float_result - expected); - - std::cout << "Native Type Result:\n"; - std::cout << "Result = " << std::setprecision(3) << (double)double_result - << "\n"; + std::cout << "ap_float<11,44> Result:\n"; + std::cout << "Result = " << std::setprecision(3) + << (double)ap_float_result << "\n"; std::cout << "Expected = " << std::setprecision(3) << (double)expected << "\n"; std::cout << "Difference = " << std::setprecision(3) << (double)difference_a << "\n\n"; - std::cout << "Non Native Type Result:\n"; - std::cout << "Result = " << std::setprecision(3) - << (double)ap_float_result << "\n"; + // Approximate with bfloat16, which is an alias of type ap_float<8,7> + ihc::bfloat16 bfloat_result; + SineApproximationKernel( + q, (ihc::bfloat16)input, bfloat_result); + double difference_b = std::abs((double)bfloat_result - expected); + + std::cout << "bfloat16 Result:\n"; + std::cout << "Result = " << std::setprecision(3) << (double)bfloat_result + << "\n"; std::cout << "Expected = " << std::setprecision(3) << (double)expected << "\n"; std::cout << "Difference = " << std::setprecision(3) << (double)difference_b << "\n"; - passed_native = (difference_a < kSineApproximationEpsilon); - passed_non_native = (difference_b < kSineApproximationEpsilon); - passed_comparison = (difference_a < difference_b); - + // 
Approximation with ap_float<11, 44> is expected to be more accurate than
+  // approximation with bfloat16
   std::cout << "\nSine Approximation: ";
-  if (passed_native && passed_comparison && passed_comparison) {
+  if (difference_a < difference_b) {
     std::cout << "PASSED\n\n";
     return true;
   } else {
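The calls to `SineApproximationKernel(...)` in the hunks above rely on explicit angle-bracket template arguments that do not survive in the patch text as shown (for example, `template` appears with no parameter list). The sketch below illustrates how a kernel body written once can be instantiated per kernel name and data type, which is the code-reuse point the README makes; the `<KernelName, T>` parameter list, the `DemoSineDouble`/`DemoSineFloat` names, and the plain `double`/`float` stand-ins (in place of the tutorial's `ihc::ap_float<11, 44>` and `ihc::bfloat16` types) are assumptions for illustration, not code taken from this patch.

```cpp
// Hypothetical sketch: one templated kernel body, instantiated for two
// (kernel name, data type) pairs. Names and header path are assumptions.
#include <sycl/sycl.hpp>  // older oneAPI releases use <CL/sycl.hpp>
#include <cmath>
#include <iostream>

using namespace sycl;

// Forward-declared kernel names (stand-ins for the tutorial's
// ApproximateSineWithAPFloat_11_44 / ApproximateSineWithBFloat16).
class DemoSineDouble;
class DemoSineFloat;

// The kernel body is written once; only the kernel name and the
// floating-point type differ between instantiations.
template <typename KernelName, typename T>
void SineApproximationSketch(queue &q, T input, T &output) {
  buffer<T, 1> inp_buffer(&input, 1);
  buffer<T, 1> res_buffer(&output, 1);

  q.submit([&](handler &h) {
    accessor x{inp_buffer, h, read_only};
    accessor retval{res_buffer, h, write_only, no_init};

    h.single_task<KernelName>([=] {
      // Truncated Taylor series for sin(x), mirroring the loop structure
      // of the tutorial kernel shown in the patch above.
      T res = 0.0;
      T sign = 1.0;
      T term = x[0];
      T numerator = x[0];
      T denom = 1.0;
      for (int i = 1; i <= 10; ++i) {
        res += term;
        sign = -sign;
        denom *= 2 * i * (2 * i + 1);
        numerator *= x[0] * x[0];
        term = sign * numerator / denom;
      }
      retval[0] = res;
    });
  });
  // res_buffer goes out of scope here; its destructor waits for the kernel
  // to finish and copies the result back into `output`.
}

int main() {
  queue q;

  double in_d = M_PI_4, out_d = 0.0;
  float in_f = static_cast<float>(M_PI_4), out_f = 0.0f;

  // Same kernel body, two instantiations: only the template arguments change.
  // In the tutorial, the second pair would use ihc::ap_float<11, 44> or
  // ihc::bfloat16 instead of a native type.
  SineApproximationSketch<DemoSineDouble, double>(q, in_d, out_d);
  SineApproximationSketch<DemoSineFloat, float>(q, in_f, out_f);

  std::cout << "sin(pi/4) ~= " << out_d << " (double), " << out_f
            << " (float)\n";
  return 0;
}
```

Forward declaring the kernel name classes and passing them as the first template argument keeps kernel names readable in the optimization reports, which is the FPGA best practice the source comments above already call out.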