Add the firmware exercise. Not yet working

573c5178 · Sioni Summers · e19b70a2 · 573c5178 · 573c5178 · 573c5178
Commit 573c5178 authored 1 year ago by Sioni Summers
--- a/part3/exercise.md
+++ b/part3/exercise.md
--- a/part3/firmware/Scaler/hls/defines.h
+++ b/part3/firmware/Scaler/hls/defines.h
+#ifndef L1TMLDEMO_DEFINES_H_
+#define L1TMLDEMO_DEFINES_H_
+
+#include "ap_fixed.h"
+
+// Fixed-point types shared by the scaler HLS function and the generated
+// scales.h tables. ap_fixed<W,I,...> is W bits wide in total, of which I are
+// integer bits (sign bit included).
+// these types all match the CMSSW emulator
+static const int N_INPUTS=56;                   // number of elements in the scaler's input and output arrays
+typedef ap_fixed<14,13> unscaled_t;             // element type of the raw (unscaled) input array
+typedef ap_fixed<16,7,AP_RND,AP_SAT> scaled_t;  // element type of the scaled output; rounds and saturates on conversion
+typedef ap_fixed<16,6> scale_t;                 // element type of the scale[] table
+typedef ap_fixed<16,6> bias_t;                  // element type of the bias[] table
+
+#endif
\ No newline at end of file
--- a/part3/firmware/Scaler/hls/scaler.cpp
+++ b/part3/firmware/Scaler/hls/scaler.cpp
+#include "defines.h"
+#include "scales.h"
+
+// Standard-scale the inputs element by element:
+//   X_scaled[i] = (X_unscaled[i] - bias[i]) * scale[i]   for i in [0, N_INPUTS)
+// bias[] and scale[] are the constant tables from scales.h. The product is
+// converted to scaled_t on assignment, i.e. with the AP_RND / AP_SAT modes
+// declared for that type in defines.h.
+void scaler(unscaled_t X_unscaled[N_INPUTS], scaled_t X_scaled[N_INPUTS]){
+  // accept a new set of inputs every clock cycle
+  #pragma HLS pipeline II=1
+  #pragma HLS array_partition variable=X_unscaled complete   // leave this interface as an array
+  #pragma HLS array_reshape variable=X_scaled complete dim=0 // make an hls4ml-style wide interface
+  // NOTE(review): presumably fully unrolled by the function-level pipeline
+  // pragma above — confirm in the synthesis report
+  for(int i = 0; i < N_INPUTS; i++){
+    X_scaled[i] = (X_unscaled[i] - bias[i]) * scale[i];
+  }
+}
\ No newline at end of file
--- a/part3/firmware/Scaler/hls/synth.tcl
+++ b/part3/firmware/Scaler/hls/synth.tcl
+# Vivado HLS batch script: run C synthesis of the scaler function.
+# Only csynth_design is run; there is no C simulation or IP export step.
+# The generated VHDL under scaler_prj/solution1/syn/vhdl is what
+# setup_fw_workspace.sh copies into the ipbb workspace.
+
+# (re)create the project from scratch
+open_project -reset scaler_prj
+# top-level function to synthesise, defined in scaler.cpp
+set_top scaler
+add_files scaler.cpp
+open_solution -reset "solution1"
+# target device
+set_part xcvu13p-flga2577-2-e
+# target clock period (ns)
+create_clock -period 2.5 -name default
+csynth_design
+quit
\ No newline at end of file
--- a/part3/firmware/payload/firmware/cfg/top_serenity.dep
+++ b/part3/firmware/payload/firmware/cfg/top_serenity.dep
+# algorithm includes
+include -c NN nn.dep
+include -c Scaler scaler.dep
+src NNWrapper.vhd
+src ScalerWrapper.vhd
+src payload.vhd
+
+# emp fwk includes
+include -c emp-fwk:boards/serenity/dc_vu13p dc_vu13p_so2.dep
+src emp_project_decl.vhd
+#addrtab emp_payload.xml
--- a/part3/firmware/payload/firmware/hdl/NNWrapper.vhd
+++ b/part3/firmware/payload/firmware/hdl/NNWrapper.vhd
+library IEEE;
+use IEEE.std_logic_1164.all;
+use IEEE.numeric_std.all;
+
+library NNLib;
+
+use work.emp_data_types.all;
+use work.emp_project_decl.all;
+
+-- Thin wrapper around the hls4ml-generated neural network (NNLib.L1TMLDemo_v1).
+--
+--   clk      : clock for the NN
+--   X_scaled : all NN inputs flattened into one 896-bit vector
+--   X_vld    : valid flag accompanying X_scaled
+--   y        : NN output word
+--   y_vld    : valid flag accompanying y
+entity NNWrapper is
+port(
+  clk      : in std_logic;
+  X_scaled : in std_logic_vector(895 downto 0);
+  X_vld    : in std_logic;
+  y        : out std_logic_vector(12 downto 0);
+  y_vld    : out std_logic
+);
+end NNWrapper;
+
+architecture rtl of NNWrapper is
+
+begin
+
+  -- make an instance of the Neural Network
+  -- Direct entity instantiation: no component declaration exists for the NN,
+  -- so the 'entity' keyword and the library prefix are required.
+  NNInstance : entity NNLib.L1TMLDemo_v1
+  port map(
+    ap_clk => clk,
+    ap_rst => '0', -- never reset
+    ap_start => '1', -- always run
+    -- data goes to the data port, the valid flag to the valid port
+    input_1_V => X_scaled,
+    input_1_V_ap_vld => X_vld,
+    layer13_out_0_V => y,
+    layer13_out_0_V_ap_vld => y_vld
+  );
+
+end rtl;
\ No newline at end of file
--- a/part3/firmware/payload/firmware/hdl/ScalerWrapper.vhd
+++ b/part3/firmware/payload/firmware/hdl/ScalerWrapper.vhd
+library IEEE;
+use IEEE.std_logic_1164.all;
+use IEEE.numeric_std.all;
+
+--library ScalerLib;
+
+use work.emp_data_types.all;
+use work.emp_project_decl.all;
+use work.emp_device_decl.all;
+
+-- Wrapper that unpacks the incoming link data into the 56 individual inputs
+-- of the HLS 'scaler' block and exposes its flattened output.
+--
+--   clk      : clock for the scaler
+--   d        : incoming link data (4 links per region)
+--   X_scaled : all scaled values flattened into one 896-bit vector
+--   X_vld    : valid flag accompanying X_scaled
+--
+-- Link usage: d(0) carries 2 inputs, d(1)..d(15) and d(112)..d(114) carry
+-- 3 inputs each, giving 2 + 18 * 3 = 56 inputs in total.
+entity ScalerWrapper is
+port(
+  clk      : in std_logic;
+  d        : in  ldata(4 * N_REGION - 1 downto 0);  -- data in
+  X_scaled : out std_logic_vector(895 downto 0);
+  X_vld    : out std_logic
+);
+end ScalerWrapper;
+
+architecture rtl of ScalerWrapper is
+
+begin
+
+  -- make an instance of the Standard Scaler
+  -- Direct entity instantiation: no component declaration exists for the
+  -- scaler, and scaler.dep adds its sources without a library option, so it
+  -- is looked up in 'work'.
+  --
+  -- NOTE(review): unscaled_t in defines.h is 14 bits wide, but the slices
+  -- (25 downto 14) and (41 downto 26) below are 12 and 16 bits wide. Confirm
+  -- the link data format and make every slice match the X_unscaled_*_V width.
+  ScalerInstance : entity work.scaler
+  port map(
+    ap_clk => clk,
+    ap_rst => '0', -- never reset
+    ap_start => d(0).valid, -- run whenever the MET link is valid
+    -- MET
+    X_unscaled_0_V => d(0).data(13 downto 0),
+    X_unscaled_1_V => d(0).data(25 downto 14),
+
+    X_unscaled_2_V => d(1).data(13 downto 0),
+    X_unscaled_3_V => d(1).data(25 downto 14),
+    X_unscaled_4_V => d(1).data(41 downto 26),
+
+    X_unscaled_5_V => d(2).data(13 downto 0),
+    X_unscaled_6_V => d(2).data(25 downto 14),
+    X_unscaled_7_V => d(2).data(41 downto 26),
+
+    X_unscaled_8_V => d(3).data(13 downto 0),
+    X_unscaled_9_V => d(3).data(25 downto 14),
+    X_unscaled_10_V => d(3).data(41 downto 26),
+
+    X_unscaled_11_V => d(4).data(13 downto 0),
+    X_unscaled_12_V => d(4).data(25 downto 14),
+    X_unscaled_13_V => d(4).data(41 downto 26),
+
+    X_unscaled_14_V => d(5).data(13 downto 0),
+    X_unscaled_15_V => d(5).data(25 downto 14),
+    X_unscaled_16_V => d(5).data(41 downto 26),
+
+    X_unscaled_17_V => d(6).data(13 downto 0),
+    X_unscaled_18_V => d(6).data(25 downto 14),
+    X_unscaled_19_V => d(6).data(41 downto 26),
+
+    X_unscaled_20_V => d(7).data(13 downto 0),
+    X_unscaled_21_V => d(7).data(25 downto 14),
+    X_unscaled_22_V => d(7).data(41 downto 26),
+
+    X_unscaled_23_V => d(8).data(13 downto 0),
+    X_unscaled_24_V => d(8).data(25 downto 14),
+    X_unscaled_25_V => d(8).data(41 downto 26),
+
+    X_unscaled_26_V => d(9).data(13 downto 0),
+    X_unscaled_27_V => d(9).data(25 downto 14),
+    X_unscaled_28_V => d(9).data(41 downto 26),
+
+    X_unscaled_29_V => d(10).data(13 downto 0),
+    X_unscaled_30_V => d(10).data(25 downto 14),
+    X_unscaled_31_V => d(10).data(41 downto 26),
+
+    X_unscaled_32_V => d(11).data(13 downto 0),
+    X_unscaled_33_V => d(11).data(25 downto 14),
+    X_unscaled_34_V => d(11).data(41 downto 26),
+
+    X_unscaled_35_V => d(12).data(13 downto 0),
+    X_unscaled_36_V => d(12).data(25 downto 14),
+    X_unscaled_37_V => d(12).data(41 downto 26),
+
+    X_unscaled_38_V => d(13).data(13 downto 0),
+    X_unscaled_39_V => d(13).data(25 downto 14),
+    X_unscaled_40_V => d(13).data(41 downto 26),
+
+    X_unscaled_41_V => d(14).data(13 downto 0),
+    X_unscaled_42_V => d(14).data(25 downto 14),
+    X_unscaled_43_V => d(14).data(41 downto 26),
+
+    X_unscaled_44_V => d(15).data(13 downto 0),
+    X_unscaled_45_V => d(15).data(25 downto 14),
+    X_unscaled_46_V => d(15).data(41 downto 26),
+
+    X_unscaled_47_V => d(112).data(13 downto 0),
+    X_unscaled_48_V => d(112).data(25 downto 14),
+    X_unscaled_49_V => d(112).data(41 downto 26),
+
+    X_unscaled_50_V => d(113).data(13 downto 0),
+    X_unscaled_51_V => d(113).data(25 downto 14),
+    X_unscaled_52_V => d(113).data(41 downto 26),
+
+    X_unscaled_53_V => d(114).data(13 downto 0),
+    X_unscaled_54_V => d(114).data(25 downto 14),
+    X_unscaled_55_V => d(114).data(41 downto 26),
+
+    X_scaled_V => X_scaled,
+    X_scaled_V_ap_vld => X_vld
+  );
+
+end rtl;
\ No newline at end of file
--- a/part3/firmware/payload/firmware/hdl/emp_project_decl.vhd
+++ b/part3/firmware/payload/firmware/hdl/emp_project_decl.vhd
+-- emp_project_decl for the VU13P Daughter Card modified example design
+--
+-- Defines constants for the whole project
+--
+
+library IEEE;
+use IEEE.STD_LOGIC_1164.all;
+
+use work.emp_framework_decl.all;
+use work.emp_device_types.all;
+use work.emp_slink_types.all;
+
+package emp_project_decl is
+
+  constant PAYLOAD_REV : std_logic_vector(31 downto 0) := X"70900000";
+
+  -- Latency buffer size
+  constant LB_ADDR_WIDTH   : integer := 10;
+
+  -- Clock setup
+  constant CLOCK_COMMON_RATIO : integer               := 36;
+  constant CLOCK_RATIO        : integer               := 9;
+  constant CLOCK_AUX_DIV      : clock_divisor_array_t := (18, 9, 4); -- Dividers of CLOCK_COMMON_RATIO * 40 MHz
+
+  -- Only used by nullalgo
+  constant PAYLOAD_LATENCY : integer := 5;
+
+  -- Per-region link configuration, indexed by region number.
+  -- Only regions 0-3 and 28 are given gty25 / buf settings. With 4 links per
+  -- region these are the regions containing links 0..15 and 112..114, which
+  -- are the links ScalerWrapper reads its inputs from. Region 0 is the only
+  -- one with buf / gty25 in its last two fields; payload.vhd drives q(0).
+  -- NOTE(review): the five fields are presumably (input MGT, input buffer,
+  -- formatter, output buffer, output MGT) — confirm against the emp-fwk
+  -- declaration of region_conf_array_t.
+  constant REGION_CONF : region_conf_array_t := (
+    0      => (gty25, buf, no_fmt, buf, gty25),
+    1      => (gty25, buf, no_fmt, no_buf, no_mgt),
+    2      => (gty25, buf, no_fmt, no_buf, no_mgt),
+    3      => (gty25, buf, no_fmt, no_buf, no_mgt),
+    4      => kDummyRegion,             -- HighSpeedBus
+    5      => kDummyRegion,             -- PCIe, AXI & TCDS
+    6      => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    7      => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    8      => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    9      => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    10     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    11     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    12     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    13     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    14     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    15     => kDummyRegion,             -- Unconnected
+    -- Cross-chip
+    16     => kDummyRegion,             -- Unconnected
+    17     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    18     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    19     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    20     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    21     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    22     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    23     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    24     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    25     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    26     => kDummyRegion,             -- Unconnected
+    27     => kDummyRegion,             -- HighSpeedBus
+    28     => (gty25, buf, no_fmt, no_buf, no_mgt),
+    29     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    30     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    31     => (no_mgt, no_buf, no_fmt, no_buf, no_mgt),
+    others => kDummyRegion
+    );
+
+  -- Specify the slink quad using the corresponding region conf ID
+  -- Specify slink channels to enable using the channel mask
+  constant SLINK_CONF : slink_conf_array_t := (
+    others      => kNoSlink
+    );  
+  
+end emp_project_decl;
--- a/part3/firmware/payload/firmware/hdl/payload.vhd
+++ b/part3/firmware/payload/firmware/hdl/payload.vhd
+library IEEE;
+use IEEE.STD_LOGIC_1164.ALL;
+
+use work.ipbus.all;
+use work.emp_data_types.all;
+use work.emp_project_decl.all;
+
+use work.emp_device_decl.all;
+use work.emp_ttc_decl.all;
+use work.emp_slink_types.all;
+
+-- EMP payload: feeds the incoming link data through the Standard Scaler and
+-- then the Neural Network, and puts the NN result on output link 0.
+entity emp_payload is
+  port(
+    clk         : in  std_logic;        -- ipbus signals
+    rst         : in  std_logic;
+    ipb_in      : in  ipb_wbus;
+    ipb_out     : out ipb_rbus;
+    clk40       : in  std_logic;
+    clk_payload : in  std_logic_vector(2 downto 0);
+    rst_payload : in  std_logic_vector(2 downto 0);
+    clk_p       : in  std_logic;        -- data clock
+    rst_loc     : in  std_logic_vector(N_REGION - 1 downto 0);
+    clken_loc   : in  std_logic_vector(N_REGION - 1 downto 0);
+    ctrs        : in  ttc_stuff_array;
+    bc0         : out std_logic;
+    d           : in  ldata(4 * N_REGION - 1 downto 0);  -- data in
+    q           : out ldata(4 * N_REGION - 1 downto 0);  -- data out
+    gpio        : out std_logic_vector(29 downto 0);  -- IO to mezzanine connector
+    gpio_en     : out std_logic_vector(29 downto 0);  -- IO to mezzanine connector (three-state enables)
+    slink_q : out slink_input_data_quad_array(SLINK_MAX_QUADS-1 downto 0);
+    backpressure : in std_logic_vector(SLINK_MAX_QUADS-1 downto 0)
+    );
+end emp_payload;
+
+architecture rtl of emp_payload is
+
+  -- scaler output / NN input (flattened) and its valid flag
+  signal X_scaled     : std_logic_vector(895 downto 0) := (others => '0');
+  signal X_scaled_vld : std_logic := '0';
+  -- NN output and its valid flag
+  signal y            : std_logic_vector(12 downto 0) := (others => '0');
+  signal y_vld        : std_logic := '0';
+
+begin
+
+  -- scale the inputs
+  -- The link data is passed straight through (no delayed copy is declared in
+  -- this architecture). The formal names are those of the ScalerWrapper
+  -- entity, whose valid output is called X_vld.
+  ScalerInstance : entity work.ScalerWrapper
+  port map(
+    clk      => clk_p,
+    d        => d,
+    X_scaled => X_scaled,
+    X_vld    => X_scaled_vld
+  );
+
+  -- run the NN
+  NNInstance : entity work.NNWrapper
+  port map(
+    clk      => clk_p,
+    X_scaled => X_scaled,
+    X_vld    => X_scaled_vld,
+    y        => y,
+    y_vld    => y_vld
+  );
+
+  -- hook up the output
+  -- NOTE(review): only these two fields of q(0) are driven; the remaining
+  -- fields of q(0), all other q links and slink_q are left undriven here.
+  -- Confirm whether they need explicit null values from the emp-fwk packages.
+  q(0).data(12 downto 0) <= y;
+  q(0).valid             <= y_vld;
+
+  -- unused framework outputs
+  ipb_out <= IPB_RBUS_NULL;
+  bc0 <= '0';
+  gpio <= (others => '0');
+  gpio_en <= (others => '0');
+
+end rtl;
\ No newline at end of file
--- a/part3/make_scaler_hls.py
+++ b/part3/make_scaler_hls.py
+import pickle
+import os
+
+# read the standard scaler from part 1 and write the values to a C array for the HLS module
+
+scaler_file = os.environ['MLATL1T_DIR'] + '/part1_outputs/hwScaler.pkl'
+scaler = pickle.load(open(scaler_file, 'rb'))
+scales = 1. / scaler.scale_
+bias = scaler.mean_
+
+template = '''
+#ifndef L1TMLDEMO_SCALES_H_
+#define L1TMLDEMO_SCALES_H_
+#include "defines.h"
+
+static const scale_t scale[N_INPUTS] = {{{}}};
+static const bias_t bias[N_INPUTS] = {{{}}};
+
+#endif
+'''
+
+with open(os.environ['MLATL1T_DIR'] + '/part3/firmware/Scaler/hls/scales.h', 'w') as f:
+  scales_txt = ','.join(str(s) for s in scales)
+  bias_txt = ','.join(str(b) for b in bias)
+  header = template.format(scales_txt, bias_txt)
+  f.write(header)
\ No newline at end of file
--- a/part3/serenity_exercise.md
+++ b/part3/serenity_exercise.md
+# Building Phase 2 FPGA bitfile with NN
+
+We will target a Serenity board, one of the CMS Phase 2 L1T boards. While some of the configuration is specific to the Serenity, some other Phase 2 L1T boards also use the `emp-fwk` and the same workflow.
+Other boards like APx have a different workflow, but the concepts are similar.
+
+This part of the tutorial will take too long to run through in the time available during the live session. 
+The FPGA part used by the Serenity board is not installed on the Vivado installation we're using from `lxplus` either. Instead we will show a demonstration of the results. In general running synthesis and implementation for the Virtex Ultrascale+ FPGAs that will be used in the CMS Phase 2 L1T can take several hours or even days, and benefits from access to high single-core performance, high memory machines rather than shared resources like `lxplus`. We also recommend that you use more recent versions of Vivado than are available on the image used for the tutorial.
+
+## Prerequisites
+Accessing some of the repositories required for building firmware requires permissions. Join the `cms-cactus` and `cms-tcds2-users` e-groups before starting, and allow some hours for the synchronisation of the permissions databases.
+
+You also need to have synthesized the hls4ml NN from part 2.
+
+## 1. Create Standard Scaler HLS IP
+
+We will deploy our NN as one module in the 'payload', and the Standard Scaler as a separate module. We've provided a Python script to read the values from the Standard Scaler and write some valid HLS code to do the scaling.
+
+The VHDL interface of the hls4ml NN is also not very nice: the X data port is 'flattened' over the inputs, resulting in a 56 * 16 bit = 896 bit wide vector like this:
+
+```vhdl
+input_1_V : IN STD_LOGIC_VECTOR (895 downto 0);
+```
+
+HLS gives us control over this interface, so we will also use the Standard Scaler HLS as an 'adapter' from an array of 56 inputs to this 896 bit wide input.
+
+This could also be achieved in VHDL with a `generate` statement, or by modifying the `#pragma hls interface` settings of the hls4ml function.
+
+```shell
+cd $MLATL1T_DIR/part3
+python make_scaler_hls.py
+```
+
+This creates the file `$MLATL1T_DIR/part3/firmware/Scaler/hls/scales.h` with contents like:
+
+```c++
+#ifndef L1TMLDEMO_SCALES_H_
+#define L1TMLDEMO_SCALES_H_
+#include "defines.h"
+
+static const scale_t scale[N_INPUTS] = {... scale values};
+static const bias_t bias[N_INPUTS] = {... bias values};
+
+#endif
+```
+
+Then we need to synthesize the scaler HLS:
+
+```shell
+cd $MLATL1T_DIR/part3/firmware/Scaler/hls
+vivado_hls -f synth.tcl
+```
+
+## 2. Setup IPBus Builder workspace
+
+This will clone several Gitlab repositories to the directory `$MLATL1T_DIR/part3/p2fwk-work/`
+
+```shell
+cd $MLATL1T_DIR/part3
+bash setup_fw_workspace.sh
+```
+
+After completion the output of `tree $MLATL1T_DIR/part3/p2fwk-work/ -L 2` should be:
+
+```shell
+$ tree . -L 2
+$MLATL1T_DIR/part3/p2fwk-work/
+├── proj
+└── src
+    ├── cms-tcds2-firmware
+    ├── emp-fwk
+    ├── ipbus-firmware
+    ├── L1TMLDemo
+    ├── legacy_ttc
+    ├── slinkrocket
+    ├── slinkrocket_ips
+    └── tclink
+```
+
+## 3. Create `ipbb` project
+
+```shell
+cd $MLATL1T_DIR/part3/p2fwk-work/
+cp src/emp-fwk/components/ttc/firmware/hdl/ipbus_decode_ipbus_tcds2_interface_accessor.vhd src/cms-tcds2-firmware/components/tcds2_interface/firmware/hdl/
+ipbb proj create vivado L1TMLDemo L1TMLDemo:payload top_serenity.dep
+#ipbb ipbus gendecoders
+cd proj/L1TMLDemo
+ipbb vivado generate-project --single
+```
+
+## 4. Build `ipbb` project
+
+Launch the synthesis and implementation; this will take a few hours!
+
+```shell
+ipbb vivado synth -j8 impl -j8
+ipbb vivado package
+```
+
+## 5. Make a pattern file
+Format some hardware input data into a hex-formatted columnar file representing data on optical links.
+This file will be loaded into buffers (BRAMs) next to the transceivers in the FPGA fabric, mimicking data arriving from CMS or another L1T board.
+
+## 6. Run on Serenity
\ No newline at end of file
--- a/part3/setup_fw_workspace.sh
+++ b/part3/setup_fw_workspace.sh
+pip install https://github.com/ipbus/ipbb/archive/dev/2023a.tar.gz
+ipbb init $MLATL1T_DIR/part3/p2fwk-work
+cd $MLATL1T_DIR/part3/p2fwk-work
+# these are the framework packages required
+ipbb add git https://gitlab.cern.ch/p2-xware/firmware/emp-fwk.git
+ipbb add git https://gitlab.cern.ch/ttc/legacy_ttc.git -b v2.1
+ipbb add git https://gitlab.cern.ch/cms-tcds/cms-tcds2-firmware.git -b v0_1_1
+ipbb add git https://gitlab.cern.ch/HPTD/tclink.git -r fda0bcf
+ipbb add git https://gitlab.cern.ch/dth_p1-v2/slinkrocket_ips.git -b v03.12
+ipbb add git https://gitlab.cern.ch/dth_p1-v2/slinkrocket.git -b v03.12
+ipbb add git https://github.com/ipbus/ipbus-firmware -b v1.9
+
+# this is our project code
+mkdir $MLATL1T_DIR/part3/p2fwk-work/src/L1TMLDemo
+ln -rs $MLATL1T_DIR/part3/firmware/ $MLATL1T_DIR/part3/p2fwk-work/src/L1TMLDemo
+
+# copy the synthesized Neural Network VHDL to the ipbb workspace
+mkdir -p $MLATL1T_DIR/part3/p2fwk-work/src/L1TMLDemo/NN/firmware/hdl
+mkdir -p $MLATL1T_DIR/part3/p2fwk-work/src/L1TMLDemo/NN/firmware/cfg
+cp $MLATL1T_DIR/part2_outputs/L1TMLDemo_v1/L1TMLDemo_v1_prj/solution1/syn/vhdl/* $MLATL1T_DIR/part3/p2fwk-work/src/L1TMLDemo/NN/firmware/hdl/
+# make an ipbb dependency file
+for f in `ls $MLATL1T_DIR/part2_outputs/L1TMLDemo_v1/L1TMLDemo_v1_prj/solution1/syn/vhdl/`
+do
+  echo "src -l NNLib $f" >> $MLATL1T_DIR/part3/p2fwk-work/src/L1TMLDemo/NN/firmware/cfg/nn.dep
+done
+
+# copy thr synthesized Standard Scaler VHDL to the ipbb workspace
+mkdir -p $MLATL1T_DIR/part3/p2fwk-work/src/L1TMLDemo/Scaler/firmware/hdl
+mkdir -p $MLATL1T_DIR/part3/p2fwk-work/src/L1TMLDemo/Scaler/firmware/cfg
+cp $MLATL1T_DIR/part3/firmware/Scaler/hls/scaler_prj/solution1/syn/vhdl/* $MLATL1T_DIR/part3/p2fwk-work/src/L1TMLDemo/Scaler/firmware/hdl/
+for f in `ls $MLATL1T_DIR/part3/firmware/Scaler/hls/scaler_prj/solution1/syn/vhdl`
+do
+  echo "src $f" >> $MLATL1T_DIR/part3/p2fwk-work/src/L1TMLDemo/Scaler/firmware/cfg/scaler.dep
+done
\ No newline at end of file