Skip to content
Snippets Groups Projects
Commit 573c5178 authored by Sioni Summers's avatar Sioni Summers
Browse files

Add the firmware exercise. Not yet working

parent e19b70a2
No related branches found
No related tags found
No related merge requests found
File moved
#ifndef L1TMLDEMO_DEFINES_H_
#define L1TMLDEMO_DEFINES_H_
#include "ap_fixed.h"
// Shared constants and fixed-point types for the Standard Scaler HLS module.
// these types all match the CMSSW emulator
// Number of elements in the scaler's input and output arrays.
static const int N_INPUTS=56;
// Raw (unscaled) input value: 14 bits wide, 13 of them integer bits.
typedef ap_fixed<14,13> unscaled_t;
// Scaled output value: 16 bits wide, 7 integer bits; rounds (AP_RND) and
// saturates (AP_SAT) on assignment instead of truncating / wrapping.
typedef ap_fixed<16,7,AP_RND,AP_SAT> scaled_t;
// Per-input multiplicative factor: 16 bits wide, 6 integer bits.
typedef ap_fixed<16,6> scale_t;
// Per-input offset subtracted before scaling: 16 bits wide, 6 integer bits.
typedef ap_fixed<16,6> bias_t;
#endif
\ No newline at end of file
#include "defines.h"
#include "scales.h"
// Apply a per-element affine scaling to N_INPUTS values:
//   X_scaled[i] = (X_unscaled[i] - bias[i]) * scale[i]
// `bias` and `scale` are the constant arrays from scales.h.
//
// X_unscaled : input array of N_INPUTS raw values (fully partitioned).
// X_scaled   : output array of N_INPUTS scaled values (fully reshaped);
//              assignment to scaled_t applies that type's rounding/saturation.
void scaler(unscaled_t X_unscaled[N_INPUTS], scaled_t X_scaled[N_INPUTS]){
// Request a fully pipelined design with an initiation interval of 1.
#pragma HLS pipeline II=1
#pragma HLS array_partition variable=X_unscaled complete // leave this interface as an array
#pragma HLS array_reshape variable=X_scaled complete dim=0 // make an hls4ml-style wide interface
for(int i = 0; i < N_INPUTS; i++){
// Remove the bias first, then multiply by the scale factor.
X_scaled[i] = (X_unscaled[i] - bias[i]) * scale[i];
}
}
\ No newline at end of file
# Vivado HLS batch script: C-synthesize the `scaler` function from scaler.cpp.
# Start from a clean project directory named scaler_prj.
open_project -reset scaler_prj
# The top-level function to synthesize.
set_top scaler
add_files scaler.cpp
open_solution -reset "solution1"
# Target FPGA part.
set_part xcvu13p-flga2577-2-e
# Target clock period of 2.5 for the default clock.
create_clock -period 2.5 -name default
# Run C synthesis only (no C simulation or export steps are requested here).
csynth_design
quit
\ No newline at end of file
# Top-level dependency file for the L1TMLDemo payload: pulls in the NN and
# Scaler components, the VHDL wrappers around them, and the emp-fwk board
# dependencies from boards/serenity/dc_vu13p.
# algorithm includes
include -c NN nn.dep
include -c Scaler scaler.dep
src NNWrapper.vhd
src ScalerWrapper.vhd
src payload.vhd
# emp fwk includes
include -c emp-fwk:boards/serenity/dc_vu13p dc_vu13p_so2.dep
src emp_project_decl.vhd
# The payload address table is currently disabled.
#addrtab emp_payload.xml
-- NNWrapper
--
-- Thin wrapper around the HLS-generated Neural Network (entity L1TMLDemo_v1,
-- compiled into library NNLib). The network is held out of reset and is
-- permanently started; data validity is carried by the *_ap_vld handshakes.
--
-- Ports:
--   clk      : clock for the NN
--   X_scaled : scaled NN inputs, flattened into one 896-bit vector
--   X_vld    : high when X_scaled holds valid data
--   y        : NN output
--   y_vld    : high when y holds valid data
library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.numeric_std.all;

library NNLib;

use work.emp_data_types.all;
use work.emp_project_decl.all;

entity NNWrapper is
  port(
    clk      : in  std_logic;
    X_scaled : in  std_logic_vector(895 downto 0);
    X_vld    : in  std_logic;
    y        : out std_logic_vector(12 downto 0);
    y_vld    : out std_logic
  );
end NNWrapper;

architecture rtl of NNWrapper is
begin

  -- make an instance of the Neural Network
  -- Direct entity instantiation: no component declaration exists for the NN,
  -- so the `entity` keyword and the library prefix are required.
  NNInstance : entity NNLib.L1TMLDemo_v1
    port map(
      ap_clk                 => clk,
      ap_rst                 => '0',      -- never reset
      ap_start               => '1',      -- always run
      -- The data vector drives the data port and the valid flag drives the
      -- valid port (previously these were crossed and referenced an
      -- undeclared signal).
      input_1_V              => X_scaled,
      input_1_V_ap_vld       => X_vld,
      layer13_out_0_V        => y,
      layer13_out_0_V_ap_vld => y_vld
    );

end rtl;
\ No newline at end of file
-- ScalerWrapper
--
-- Unpacks the unscaled NN inputs from the incoming link words and feeds them
-- to the HLS-generated Standard Scaler (entity `scaler`). The scaler is held
-- out of reset and started whenever link 0 carries valid data.
--
-- Ports:
--   clk      : clock for the scaler
--   d        : incoming link data
--   X_scaled : scaled values, flattened into one 896-bit vector
--   X_vld    : high when X_scaled holds valid data
--
-- NOTE(review): unscaled_t is ap_fixed<14,13>, so each X_unscaled_*_V port is
-- expected to be 14 bits wide, but the slices (25 downto 14) and
-- (41 downto 26) below are 12 and 16 bits wide. The slicing is left unchanged
-- here because the intended link packing is not visible from this file;
-- confirm the packing and make the widths agree before building.
library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.numeric_std.all;

--library ScalerLib;

use work.emp_data_types.all;
use work.emp_project_decl.all;
use work.emp_device_decl.all;

entity ScalerWrapper is
  port(
    clk      : in  std_logic;
    d        : in  ldata(4 * N_REGION - 1 downto 0);  -- data in
    X_scaled : out std_logic_vector(895 downto 0);
    X_vld    : out std_logic
  );
end ScalerWrapper;

architecture rtl of ScalerWrapper is
begin

  -- make an instance of the Standard Scaler
  -- Direct entity instantiation: there is no component declaration for
  -- `scaler`, so it must be named as an entity in library work.
  ScalerInstance : entity work.scaler
    port map(
      ap_clk   => clk,
      ap_rst   => '0',          -- never reset
      ap_start => d(0).valid,   -- run whenever the MET link is valid
      -- MET
      X_unscaled_0_V  => d(0).data(13 downto 0),
      X_unscaled_1_V  => d(0).data(25 downto 14),
      -- remaining inputs: three values per link word
      X_unscaled_2_V  => d(1).data(13 downto 0),
      X_unscaled_3_V  => d(1).data(25 downto 14),
      X_unscaled_4_V  => d(1).data(41 downto 26),
      X_unscaled_5_V  => d(2).data(13 downto 0),
      X_unscaled_6_V  => d(2).data(25 downto 14),
      X_unscaled_7_V  => d(2).data(41 downto 26),
      X_unscaled_8_V  => d(3).data(13 downto 0),
      X_unscaled_9_V  => d(3).data(25 downto 14),
      X_unscaled_10_V => d(3).data(41 downto 26),
      X_unscaled_11_V => d(4).data(13 downto 0),
      X_unscaled_12_V => d(4).data(25 downto 14),
      X_unscaled_13_V => d(4).data(41 downto 26),
      X_unscaled_14_V => d(5).data(13 downto 0),
      X_unscaled_15_V => d(5).data(25 downto 14),
      X_unscaled_16_V => d(5).data(41 downto 26),
      X_unscaled_17_V => d(6).data(13 downto 0),
      X_unscaled_18_V => d(6).data(25 downto 14),
      X_unscaled_19_V => d(6).data(41 downto 26),
      X_unscaled_20_V => d(7).data(13 downto 0),
      X_unscaled_21_V => d(7).data(25 downto 14),
      X_unscaled_22_V => d(7).data(41 downto 26),
      X_unscaled_23_V => d(8).data(13 downto 0),
      X_unscaled_24_V => d(8).data(25 downto 14),
      X_unscaled_25_V => d(8).data(41 downto 26),
      X_unscaled_26_V => d(9).data(13 downto 0),
      X_unscaled_27_V => d(9).data(25 downto 14),
      X_unscaled_28_V => d(9).data(41 downto 26),
      X_unscaled_29_V => d(10).data(13 downto 0),
      X_unscaled_30_V => d(10).data(25 downto 14),
      X_unscaled_31_V => d(10).data(41 downto 26),
      X_unscaled_32_V => d(11).data(13 downto 0),
      X_unscaled_33_V => d(11).data(25 downto 14),
      X_unscaled_34_V => d(11).data(41 downto 26),
      X_unscaled_35_V => d(12).data(13 downto 0),
      X_unscaled_36_V => d(12).data(25 downto 14),
      X_unscaled_37_V => d(12).data(41 downto 26),
      X_unscaled_38_V => d(13).data(13 downto 0),
      X_unscaled_39_V => d(13).data(25 downto 14),
      X_unscaled_40_V => d(13).data(41 downto 26),
      X_unscaled_41_V => d(14).data(13 downto 0),
      X_unscaled_42_V => d(14).data(25 downto 14),
      X_unscaled_43_V => d(14).data(41 downto 26),
      X_unscaled_44_V => d(15).data(13 downto 0),
      X_unscaled_45_V => d(15).data(25 downto 14),
      X_unscaled_46_V => d(15).data(41 downto 26),
      X_unscaled_47_V => d(112).data(13 downto 0),
      X_unscaled_48_V => d(112).data(25 downto 14),
      X_unscaled_49_V => d(112).data(41 downto 26),
      X_unscaled_50_V => d(113).data(13 downto 0),
      X_unscaled_51_V => d(113).data(25 downto 14),
      X_unscaled_52_V => d(113).data(41 downto 26),
      X_unscaled_53_V => d(114).data(13 downto 0),
      X_unscaled_54_V => d(114).data(25 downto 14),
      X_unscaled_55_V => d(114).data(41 downto 26),
      -- flattened output towards the NN
      X_scaled_V        => X_scaled,
      X_scaled_V_ap_vld => X_vld
    );

end rtl;
\ No newline at end of file
-- emp_project_decl for the VU13P Daughter Card modified example design
--
-- Defines constants for the whole project
--
-- NOTE(review): ScalerWrapper reads links d(0)..d(15) and d(112)..d(114);
-- presumably these correspond to regions 0-3 and 28 below (4 links per
-- region), which are the only regions configured with gty25 inputs and
-- input buffers - confirm against the emp-fwk region/channel numbering.
library IEEE;
use IEEE.STD_LOGIC_1164.all;
use work.emp_framework_decl.all;
use work.emp_device_types.all;
use work.emp_slink_types.all;
package emp_project_decl is
-- Payload revision word.
constant PAYLOAD_REV : std_logic_vector(31 downto 0) := X"70900000";
-- Latency buffer size
constant LB_ADDR_WIDTH : integer := 10;
-- Clock setup
constant CLOCK_COMMON_RATIO : integer := 36;
constant CLOCK_RATIO : integer := 9;
constant CLOCK_AUX_DIV : clock_divisor_array_t := (18, 9, 4); -- Dividers of CLOCK_COMMON_RATIO * 40 MHz
-- Only used by nullalgo
constant PAYLOAD_LATENCY : integer := 5;
-- Per-region link configuration. Regions 0-3 and 28 have gty25 inputs with
-- input buffers; only region 0 also has an output buffer and gty25 output.
-- All other listed regions are either fully disabled or kDummyRegion.
constant REGION_CONF : region_conf_array_t := (
0 => (gty25, buf, no_fmt, buf, gty25),
1 => (gty25, buf, no_fmt, no_buf, no_mgt),
2 => (gty25, buf, no_fmt, no_buf, no_mgt),
3 => (gty25, buf, no_fmt, no_buf, no_mgt),
4 => kDummyRegion, -- HighSpeedBus
5 => kDummyRegion, -- PCIe, AXI & TCDS
6 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
7 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
8 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
9 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
10 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
11 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
12 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
13 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
14 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
15 => kDummyRegion, -- Unconnected
-- Cross-chip
16 => kDummyRegion, -- Unconnected
17 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
18 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
19 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
20 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
21 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
22 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
23 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
24 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
25 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
26 => kDummyRegion, -- Unconnected
27 => kDummyRegion, -- HighSpeedBus
28 => (gty25, buf, no_fmt, no_buf, no_mgt),
29 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
30 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
31 => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
others => kDummyRegion
);
-- Specify the slink quad using the corresponding region conf ID
-- Specify slink channels to enable using the channel mask
-- No slink quads are enabled in this project.
constant SLINK_CONF : slink_conf_array_t := (
others => kNoSlink
);
end emp_project_decl;
-- emp_payload
--
-- Payload for the L1TMLDemo design: the incoming link data is passed through
-- the Standard Scaler, the scaled values are fed to the Neural Network, and
-- the NN output is placed on the low bits of output link 0.
--
-- NOTE(review): only q(0).data(12 downto 0) and q(0).valid are driven here;
-- the remaining fields of q and the slink_q output are left undriven, as in
-- the original. Confirm whether the framework expects explicit null values.
library IEEE;
use IEEE.STD_LOGIC_1164.ALL;

use work.ipbus.all;
use work.emp_data_types.all;
use work.emp_project_decl.all;
use work.emp_device_decl.all;
use work.emp_ttc_decl.all;
use work.emp_slink_types.all;

entity emp_payload is
  port(
    clk          : in  std_logic;  -- ipbus signals
    rst          : in  std_logic;
    ipb_in       : in  ipb_wbus;
    ipb_out      : out ipb_rbus;
    clk40        : in  std_logic;
    clk_payload  : in  std_logic_vector(2 downto 0);
    rst_payload  : in  std_logic_vector(2 downto 0);
    clk_p        : in  std_logic;  -- data clock
    rst_loc      : in  std_logic_vector(N_REGION - 1 downto 0);
    clken_loc    : in  std_logic_vector(N_REGION - 1 downto 0);
    ctrs         : in  ttc_stuff_array;
    bc0          : out std_logic;
    d            : in  ldata(4 * N_REGION - 1 downto 0);  -- data in
    q            : out ldata(4 * N_REGION - 1 downto 0);  -- data out
    gpio         : out std_logic_vector(29 downto 0);  -- IO to mezzanine connector
    gpio_en      : out std_logic_vector(29 downto 0);  -- IO to mezzanine connector (three-state enables)
    slink_q      : out slink_input_data_quad_array(SLINK_MAX_QUADS-1 downto 0);
    backpressure : in  std_logic_vector(SLINK_MAX_QUADS-1 downto 0)
  );
end emp_payload;

architecture rtl of emp_payload is

  -- Scaler -> NN connection: flattened scaled inputs and their valid flag.
  signal X_scaled     : std_logic_vector(895 downto 0) := (others => '0');
  signal X_scaled_vld : std_logic := '0';

  -- NN result and its valid flag.
  signal y     : std_logic_vector(12 downto 0) := (others => '0');
  signal y_vld : std_logic := '0';

begin

  -- scale the inputs
  -- The link data is fed in directly: no delayed copy of `d` is declared in
  -- this architecture. The scaler's valid output port is named X_vld.
  ScalerInstance : entity work.ScalerWrapper
    port map(
      clk      => clk_p,
      d        => d,
      X_scaled => X_scaled,
      X_vld    => X_scaled_vld
    );

  -- run the NN
  NNInstance : entity work.NNWrapper
    port map(
      clk      => clk_p,
      X_scaled => X_scaled,
      X_vld    => X_scaled_vld,
      y        => y,
      y_vld    => y_vld
    );

  -- hook up the output
  q(0).data(12 downto 0) <= y;
  q(0).valid             <= y_vld;

  -- unused framework interfaces are tied off
  ipb_out <= IPB_RBUS_NULL;
  bc0     <= '0';
  gpio    <= (others => '0');
  gpio_en <= (others => '0');

end rtl;
\ No newline at end of file
import pickle
import os
# read the standard scaler from part 1 and write the values to a C array for the HLS module
# MLATL1T_DIR must be set in the environment; a KeyError is raised otherwise.
base_dir = os.environ['MLATL1T_DIR']
scaler_file = os.path.join(base_dir, 'part1_outputs', 'hwScaler.pkl')

# NOTE: unpickling can execute arbitrary code, so only load a scaler file
# that you produced yourself in part 1.
with open(scaler_file, 'rb') as pkl:
    scaler = pickle.load(pkl)

# The HLS module computes (x - bias) * scale, so store the reciprocal of the
# scaler's scale_ attribute and use its mean_ attribute as the bias.
scales = 1. / scaler.scale_
bias = scaler.mean_

# Doubled braces are literal braces in str.format; each '{{{}}}' therefore
# renders as '{' + values + '}', i.e. a C array initializer.
template = '''
#ifndef L1TMLDEMO_SCALES_H_
#define L1TMLDEMO_SCALES_H_
#include "defines.h"
static const scale_t scale[N_INPUTS] = {{{}}};
static const bias_t bias[N_INPUTS] = {{{}}};
#endif
'''

header_file = os.path.join(base_dir, 'part3', 'firmware', 'Scaler', 'hls', 'scales.h')
with open(header_file, 'w') as f:
    scales_txt = ','.join(str(s) for s in scales)
    bias_txt = ','.join(str(b) for b in bias)
    header = template.format(scales_txt, bias_txt)
    f.write(header)
\ No newline at end of file
# Building Phase 2 FPGA bitfile with NN
We will target a Serenity board, one of the CMS Phase 2 L1T boards. While some of the configuration is specific to the Serenity, some other Phase 2 L1T boards also use the `emp-fwk` and the same workflow.
Other boards like APx have a different workflow, but the concepts are similar.
This part of the tutorial will take too long to run through in the time available during the live session.
The FPGA part used by the Serenity board is not installed on the Vivado installation we're using from `lxplus` either. Instead we will show a demonstration of the results. In general running synthesis and implementation for the Virtex Ultrascale+ FPGAs that will be used in the CMS Phase 2 L1T can take several hours or even days, and benefits from access to high single-core performance, high memory machines rather than shared resources like `lxplus`. We also recommend that you use more recent versions of Vivado than are available on the image used for the tutorial.
## Prerequisites
Accessing some of the repositories required for building firmware requires permissions. Join the `cms-cactus` and `cms-tcds2-users` e-groups before starting, and allow some hours for the synchronisation of the permissions databases.
You also need to have synthesized the hls4ml NN from part 2.
## 1. Create Standard Scaler HLS IP
We will deploy our NN as one module in the 'payload', and the Standard Scaler as a separate module. We've provided a Python script to read the values from the Standard Scaler and write some valid HLS code to do the scaling.
The VHDL interface of the hls4ml NN is also not very nice: the X data port is 'flattened' over the inputs, resulting in a 56 * 16 bit = 896 bit wide vector like this:
```vhdl
input_1_V : IN STD_LOGIC_VECTOR (895 downto 0);
```
HLS gives us control over this interface, so we will also use the Standard Scaler HLS as an 'adapter' from an array of 56 inputs to this 896 bit wide input.
This could also be achieved in VHDL with a `generate` statement, or by modifying the `#pragma hls interface` settings of the hls4ml function.
```shell
cd $MLATL1T_DIR/part3
python make_scaler_hls.py
```
This creates the file `$MLATL1T_DIR/part3/firmware/Scaler/hls/scales.h` with contents like:
```c++
#ifndef L1TMLDEMO_SCALES_H_
#define L1TMLDEMO_SCALES_H_
#include "defines.h"
static const scale_t scale[N_INPUTS] = {... scale values};
static const bias_t bias[N_INPUTS] = {... bias values};
#endif
```
Then we need to synthesize the scaler HLS:
```shell
cd $MLATL1T_DIR/part3/firmware/Scaler/hls
vivado_hls -f synth.tcl
```
## 2. Setup IPBus Builder workspace
This will clone several Gitlab repositories to the directory `$MLATL1T_DIR/part3/p2fwk-work/`
```shell
bash setup_fw_workspace.sh
```
After completion the output of `tree $MLATL1T_DIR/part3/p2fwk-work/ -L 2` should be:
```shell
$ tree . -L 2
$MLATL1T_DIR/part3/p2fwk-work/
├── proj
└── src
├── cms-tcds2-firmware
├── emp-fwk
├── ipbus-firmware
├── L1TMLDemo
├── legacy_ttc
├── slinkrocket
├── slinkrocket_ips
└── tclink
```
## 3. Create `ipbb` project
```shell
cd $MLATL1T_DIR/part3/p2fwk-work/
cp src/emp-fwk/components/ttc/firmware/hdl/ipbus_decode_ipbus_tcds2_interface_accessor.vhd src/cms-tcds2-firmware/components/tcds2_interface/firmware/hdl/
ipbb proj create vivado L1TMLDemo L1TMLDemo:payload top_serenity.dep
#ipbb ipbus gendecoders
cd proj/L1TMLDemo
ipbb vivado generate-project --single
```
## 4. Build `ipbb` project
Launch the synthesis and implementation, this will take a few hours!
```shell
ipbb vivado synth -j8 impl -j8
ipbb vivado package
```
## 5. Make a pattern file
Format some hardware input data into a hex-formatted columnar file representing data on optical links.
This file will be loaded into buffers (BRAMs) next to the transceiver in the FPGA fabric, mimicking data arriving from CMS or another L1T board.
## 6. Run on Serenity
\ No newline at end of file
# Set up the ipbb firmware workspace under $MLATL1T_DIR/part3/p2fwk-work:
# install ipbb, fetch the framework repositories, link in the project code,
# and copy the HLS-generated VHDL (NN and Standard Scaler) together with an
# ipbb dependency file listing it.

WORK_DIR="$MLATL1T_DIR/part3/p2fwk-work"
DEMO_DIR="$WORK_DIR/src/L1TMLDemo"
NN_VHDL_DIR="$MLATL1T_DIR/part2_outputs/L1TMLDemo_v1/L1TMLDemo_v1_prj/solution1/syn/vhdl"
SCALER_VHDL_DIR="$MLATL1T_DIR/part3/firmware/Scaler/hls/scaler_prj/solution1/syn/vhdl"

# write_dep SRC_DIR DEP_FILE PREFIX
# Write one "PREFIX <file name>" line to DEP_FILE for every entry in SRC_DIR.
# The dep file is truncated first so re-running does not duplicate entries.
write_dep() {
    local src_dir="$1" dep_file="$2" prefix="$3" f
    : > "$dep_file"
    for f in "$src_dir"/*; do
        # an unmatched glob stays literal; skip it instead of listing '*'
        [ -e "$f" ] || continue
        echo "$prefix $(basename "$f")" >> "$dep_file"
    done
}

pip install https://github.com/ipbus/ipbb/archive/dev/2023a.tar.gz
ipbb init "$WORK_DIR"
# stop here if the workspace is missing, rather than adding repos elsewhere
cd "$WORK_DIR" || exit 1

# these are the framework packages required
ipbb add git https://gitlab.cern.ch/p2-xware/firmware/emp-fwk.git
ipbb add git https://gitlab.cern.ch/ttc/legacy_ttc.git -b v2.1
ipbb add git https://gitlab.cern.ch/cms-tcds/cms-tcds2-firmware.git -b v0_1_1
ipbb add git https://gitlab.cern.ch/HPTD/tclink.git -r fda0bcf
ipbb add git https://gitlab.cern.ch/dth_p1-v2/slinkrocket_ips.git -b v03.12
ipbb add git https://gitlab.cern.ch/dth_p1-v2/slinkrocket.git -b v03.12
ipbb add git https://github.com/ipbus/ipbus-firmware -b v1.9

# this is our project code
mkdir -p "$DEMO_DIR"
ln -rs "$MLATL1T_DIR/part3/firmware/" "$DEMO_DIR"

# copy the synthesized Neural Network VHDL to the ipbb workspace
mkdir -p "$DEMO_DIR/NN/firmware/hdl"
mkdir -p "$DEMO_DIR/NN/firmware/cfg"
cp "$NN_VHDL_DIR"/* "$DEMO_DIR/NN/firmware/hdl/"
# make an ipbb dependency file (NN sources are compiled into library NNLib)
write_dep "$NN_VHDL_DIR" "$DEMO_DIR/NN/firmware/cfg/nn.dep" "src -l NNLib"

# copy the synthesized Standard Scaler VHDL to the ipbb workspace
mkdir -p "$DEMO_DIR/Scaler/firmware/hdl"
mkdir -p "$DEMO_DIR/Scaler/firmware/cfg"
cp "$SCALER_VHDL_DIR"/* "$DEMO_DIR/Scaler/firmware/hdl/"
write_dep "$SCALER_VHDL_DIR" "$DEMO_DIR/Scaler/firmware/cfg/scaler.dep" "src"
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment