-- #################################################################################################
-- # << NEORV32 - Execute In-Place (XIP) Module >>                                                 #
-- # ********************************************************************************************* #
-- # This module allows the CPU to execute code (and read constant data) directly from an SPI      #
-- # flash memory. Two host ports are implemented: one for accessing the control and status        #
-- # registers (mapped to the processor's IO space) and one for the actual instruction/data fetch. #
-- # The actual address space mapping of the "instruction/data interface" is done by programming   #
-- # special control register bits.                                                                #
-- # ********************************************************************************************* #
-- # BSD 3-Clause License                                                                          #
-- #                                                                                               #
-- # The NEORV32 RISC-V Processor, https://github.com/stnolting/neorv32                            #
-- # Copyright (c) 2024, Stephan Nolting. All rights reserved.                                     #
-- #                                                                                               #
-- # Redistribution and use in source and binary forms, with or without modification, are          #
-- # permitted provided that the following conditions are met:                                     #
-- #                                                                                               #
-- # 1. Redistributions of source code must retain the above copyright notice, this list of        #
-- #    conditions and the following disclaimer.                                                   #
-- #                                                                                               #
-- # 2. Redistributions in binary form must reproduce the above copyright notice, this list of     #
-- #    conditions and the following disclaimer in the documentation and/or other materials        #
-- #    provided with the distribution.                                                            #
-- #                                                                                               #
-- # 3. Neither the name of the copyright holder nor the names of its contributors may be used to  #
-- #    endorse or promote products derived from this software without specific prior written      #
-- #    permission.                                                                                #
-- #                                                                                               #
-- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS   #
-- # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF               #
-- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE    #
-- # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
-- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE #
-- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED    #
-- # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     #
-- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED  #
-- # OF THE POSSIBILITY OF SUCH DAMAGE.                                                            #
-- #################################################################################################

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library neorv32;
use neorv32.neorv32_package.all;

entity neorv32_xip is
  generic (
    XIP_CACHE_EN         : boolean;                 -- implement XIP cache?
    XIP_CACHE_NUM_BLOCKS : natural range 1 to 256;  -- number of blocks (min 1), has to be a power of 2
    XIP_CACHE_BLOCK_SIZE : natural range 1 to 2**16 -- block size in bytes (min 4), has to be a power of 2
  );
  port (
    clk_i       : in  std_ulogic; -- global clock line
    rstn_i      : in  std_ulogic; -- global reset line, low-active
    bus_req_i   : in  bus_req_t;  -- bus request (control/status register port)
    bus_rsp_o   : out bus_rsp_t;  -- bus response (control/status register port)
    xip_req_i   : in  bus_req_t;  -- XIP request (instruction/data fetch port)
    xip_rsp_o   : out bus_rsp_t;  -- XIP response (instruction/data fetch port)
    clkgen_en_o : out std_ulogic; -- enable clock generator
    clkgen_i    : in  std_ulogic_vector(7 downto 0); -- clock-enable ticks, one selected via the prescaler bits
    spi_csn_o   : out std_ulogic; -- chip-select, low-active
    spi_clk_o   : out std_ulogic; -- serial clock
    spi_dat_i   : in  std_ulogic; -- device data output
    spi_dat_o   : out std_ulogic  -- controller data output
  );
end neorv32_xip;

architecture neorv32_xip_rtl of neorv32_xip is

  -- control register: bit positions --
  constant ctrl_enable_c      : natural :=  0; -- r/w: module enable
  constant ctrl_spi_prsc0_c   : natural :=  1; -- r/w: SPI clock prescaler select - bit 0
  constant ctrl_spi_prsc1_c   : natural :=  2; -- r/w: SPI clock prescaler select - bit 1
  constant ctrl_spi_prsc2_c   : natural :=  3; -- r/w: SPI clock prescaler select - bit 2
  constant ctrl_spi_cpol_c    : natural :=  4; -- r/w: SPI (idle) clock polarity
  constant ctrl_spi_cpha_c    : natural :=  5; -- r/w: SPI clock phase
  constant ctrl_spi_nbytes0_c : natural :=  6; -- r/w: SPI number of bytes in transmission (1..9) - bit 0
  constant ctrl_spi_nbytes3_c : natural :=  9; -- r/w: SPI number of bytes in transmission (1..9) - bit 3
  constant ctrl_xip_enable_c  : natural := 10; -- r/w: XIP access mode enable
  constant ctrl_xip_abytes0_c : natural := 11; -- r/w: XIP number of address bytes (0=1,1=2,2=3,3=4) - bit 0
  constant ctrl_xip_abytes1_c : natural := 12; -- r/w: XIP number of address bytes (0=1,1=2,2=3,3=4) - bit 1
  constant ctrl_rd_cmd0_c     : natural := 13; -- r/w: SPI flash read command - bit 0
  constant ctrl_rd_cmd7_c     : natural := 20; -- r/w: SPI flash read command - bit 7
  constant ctrl_spi_csen_c    : natural := 21; -- r/w: SPI chip-select enabled
  constant ctrl_highspeed_c   : natural := 22; -- r/w: SPI high-speed mode enable (ignoring ctrl_spi_prsc)
  constant ctrl_cdiv0_c       : natural := 23; -- r/w: clock divider bit 0
  constant ctrl_cdiv1_c       : natural := 24; -- r/w: clock divider bit 1
  constant ctrl_cdiv2_c       : natural := 25; -- r/w: clock divider bit 2
  constant ctrl_cdiv3_c       : natural := 26; -- r/w: clock divider bit 3
  --
  constant ctrl_burst_en_c    : natural := 29; -- r/-: XIP burst mode enable (when cache is implemented)
  constant ctrl_phy_busy_c    : natural := 30; -- r/-: SPI PHY is busy when set
  constant ctrl_xip_busy_c    : natural := 31; -- r/-: XIP access in progress
  --
  -- storage for the writable part of the control register (bits 26:0)
  signal ctrl : std_ulogic_vector(26 downto 0);

  -- direct SPI access registers --
  signal spi_data_lo : std_ulogic_vector(31 downto 0);
  signal spi_data_hi : std_ulogic_vector(31 downto 0); -- write-only!
  signal spi_trigger : std_ulogic; -- trigger direct SPI operation

  -- XIP access address (MSB-aligned according to the configured number of address bytes) --
  signal xip_addr : std_ulogic_vector(31 downto 0);

  -- SPI access fetch arbiter --
  type arbiter_state_t is (S_DIRECT, S_IDLE, S_CHECK, S_TRIG, S_BUSY, S_ERROR);
  type arbiter_t is record
    state          : arbiter_state_t;
    state_nxt      : arbiter_state_t;
    addr           : std_ulogic_vector(31 downto 0);
    addr_lookahead : std_ulogic_vector(31 downto 0);
    xip_acc_err    : std_ulogic;
    busy           : std_ulogic;
    tmo_cnt        : std_ulogic_vector(2 downto 0); -- timeout counter for auto CS de-assert (burst mode only)
  end record;
  signal arbiter : arbiter_t;

  -- cache access --
  signal cache_clear : std_ulogic;
  signal xip_req     : bus_req_t;
  signal xip_rsp     : bus_rsp_t;

  -- clock generator --
  signal cdiv_cnt   : std_ulogic_vector(3 downto 0);
  signal spi_clk_en : std_ulogic;

  -- component: XIP cache --
  component neorv32_xip_cache
    generic (
      CACHE_NUM_BLOCKS : natural range 1 to 256;  -- number of blocks (min 1), has to be a power of 2
      CACHE_BLOCK_SIZE : natural range 1 to 2**16 -- block size in bytes (min 4), has to be a power of 2
    );
    port (
      clk_i     : in  std_ulogic; -- global clock, rising edge
      rstn_i    : in  std_ulogic; -- global reset, low-active, async
      clear_i   : in  std_ulogic; -- cache clear
      cpu_req_i : in  bus_req_t;  -- request bus
      cpu_rsp_o : out bus_rsp_t;  -- response bus
      bus_req_o : out bus_req_t;  -- request bus
      bus_rsp_i : in  bus_rsp_t   -- response bus
    );
  end component;

  -- component: SPI PHY --
  component neorv32_xip_phy
    port (
      -- global control --
      rstn_i       : in  std_ulogic; -- reset, async, low-active
      clk_i        : in  std_ulogic; -- clock
      spi_clk_en_i : in  std_ulogic; -- pre-scaled SPI clock-enable
      -- operation configuration --
      cf_enable_i  : in  std_ulogic; -- module enable (reset if low)
      cf_cpha_i    : in  std_ulogic; -- clock phase
      cf_cpol_i    : in  std_ulogic; -- clock idle polarity
      -- operation control --
      op_start_i   : in  std_ulogic; -- trigger new transmission
      op_final_i   : in  std_ulogic; -- end current transmission
      op_csen_i    : in  std_ulogic; -- chip-select enable for the transmission
      op_busy_o    : out std_ulogic; -- transmission in progress when set
      op_nbytes_i  : in  std_ulogic_vector(3 downto 0);  -- actual number of bytes to transmit (1..9)
      op_wdata_i   : in  std_ulogic_vector(71 downto 0); -- write data
      op_rdata_o   : out std_ulogic_vector(31 downto 0); -- read data
      -- SPI interface --
      spi_csn_o    : out std_ulogic;
      spi_clk_o    : out std_ulogic;
      spi_dat_i    : in  std_ulogic;
      spi_dat_o    : out std_ulogic
    );
  end component;

  -- SPI PHY interface --
  type phy_if_t is record
    start : std_ulogic; -- trigger new transmission
    final : std_ulogic; -- stop current transmission
    busy  : std_ulogic; -- transmission in progress when set
    wdata : std_ulogic_vector(71 downto 0); -- write data
    rdata : std_ulogic_vector(31 downto 0); -- read data
  end record;
  signal phy_if : phy_if_t;

begin

  -- Control Bus Access ---------------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Register port, decoded by address bits 3:2:
  --   "00" control/status register, "10" SPI data low, "11" SPI data high (write-only).
  -- Every strobe is acknowledged one cycle later; this port never signals an error.
  ctrl_bus_access : process(rstn_i, clk_i)
  begin
    if (rstn_i = '0') then
      bus_rsp_o.ack  <= '0';
      bus_rsp_o.err  <= '0';
      bus_rsp_o.data <= (others => '0');
      ctrl           <= (others => '0');
      spi_data_lo    <= (others => '0');
      spi_data_hi    <= (others => '0');
      spi_trigger    <= '0';
    elsif rising_edge(clk_i) then
      -- per-cycle defaults: handshake, zero read data, single-cycle trigger pulse --
      bus_rsp_o.ack  <= bus_req_i.stb;
      bus_rsp_o.err  <= '0';
      bus_rsp_o.data <= (others => '0');
      spi_trigger    <= '0';

      if (bus_req_i.stb = '1') then
        if (bus_req_i.rw = '1') then
          -- write access --
          case bus_req_i.addr(3 downto 2) is
            when "00" => -- control register: the r/w fields tile bits 26:0 without gaps
              ctrl <= bus_req_i.data(ctrl_cdiv3_c downto ctrl_enable_c);
            when "10" => -- SPI direct data access register lo
              spi_data_lo <= bus_req_i.data;
            when "11" => -- SPI direct data access register hi
              spi_data_hi <= bus_req_i.data;
              spi_trigger <= '1'; -- trigger direct SPI transaction
            when others => -- no writable register at this offset
              null;
          end case;
        else
          -- read access --
          case bus_req_i.addr(3 downto 2) is
            when "00" => -- control register: stored r/w bits plus read-only status flags
              bus_rsp_o.data(ctrl_cdiv3_c downto ctrl_enable_c) <= ctrl;
              bus_rsp_o.data(ctrl_burst_en_c) <= bool_to_ulogic_f(XIP_CACHE_EN);
              bus_rsp_o.data(ctrl_phy_busy_c) <= phy_if.busy;
              bus_rsp_o.data(ctrl_xip_busy_c) <= arbiter.busy;
            when "10" => -- SPI direct data access register lo: PHY receive data
              bus_rsp_o.data <= phy_if.rdata;
            when others => -- unavailable (not implemented or write-only)
              bus_rsp_o.data <= (others => '0');
          end case;
        end if;
      end if;
    end if;
  end process ctrl_bus_access;


  -- XIP Cache ------------------------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  neorv32_xip_cache_inst_true:
  if XIP_CACHE_EN generate
    neorv32_xip_cache_inst: neorv32_xip_cache
    generic map (
      CACHE_NUM_BLOCKS => XIP_CACHE_NUM_BLOCKS,
      CACHE_BLOCK_SIZE => XIP_CACHE_BLOCK_SIZE
    )
    port map (
      clk_i     => clk_i,
      rstn_i    => rstn_i,
      clear_i   => cache_clear,
      cpu_req_i => xip_req_i,
      cpu_rsp_o => xip_rsp_o,
      bus_req_o => xip_req,
      bus_rsp_i => xip_rsp
    );

    -- clear cache when entire module or XIP-mode is disabled or on global FENCE operation --
    cache_clear <= '1' when (ctrl(ctrl_enable_c) = '0') or (ctrl(ctrl_xip_enable_c) = '0') or (xip_req_i.fence = '1') else '0';
  end generate;

  -- without cache the XIP port is wired straight to the arbiter --
  neorv32_xip_cache_inst_false:
  if not XIP_CACHE_EN generate
    xip_req   <= xip_req_i;
    xip_rsp_o <= xip_rsp;
  end generate;


  -- XIP Address Computation Logic ----------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  xip_access_logic: process(arbiter.addr, ctrl)
    variable addr_v : std_ulogic_vector(31 downto 0);
  begin
    -- upper 4 bits are dropped and the address is always aligned to a 32-bit boundary;
    -- sub-word read accesses are handled by the CPU logic
    addr_v := "0000" & arbiter.addr(27 downto 2) & "00";
    -- shift address bits to be MSB-aligned --
    case ctrl(ctrl_xip_abytes1_c downto ctrl_xip_abytes0_c) is
      when "00"   => xip_addr <= addr_v(7 downto 0) & x"000000"; -- 1 address byte
      when "01"   => xip_addr <= addr_v(15 downto 0) & x"0000";  -- 2 address bytes
      when "10"   => xip_addr <= addr_v(23 downto 0) & x"00";    -- 3 address bytes
      when others => xip_addr <= addr_v(31 downto 0);            -- 4 address bytes
    end case;
  end process xip_access_logic;


  -- SPI Access Arbiter ---------------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  arbiter_sync: process(rstn_i, clk_i)
  begin
    if (rstn_i = '0') then
      arbiter.state          <= S_DIRECT;
      arbiter.addr           <= (others => '0');
      arbiter.addr_lookahead <= (others => '0');
      arbiter.xip_acc_err    <= '0';
      arbiter.tmo_cnt        <= (others => '0');
    elsif rising_edge(clk_i) then
      -- state control: follow the FSM unless module or XIP mode is disabled (sync reset) --
      arbiter.state <= arbiter.state_nxt;
      if (ctrl(ctrl_enable_c) = '0') or (ctrl(ctrl_xip_enable_c) = '0') then
        arbiter.state <= S_DIRECT;
      end if;

      -- address look-ahead --
      if (xip_req.stb = '1') and (xip_req.rw = '0') then
        arbiter.addr <= xip_req.addr; -- buffer address (reducing fan-out on CPU's address net)
      end if;
      arbiter.addr_lookahead <= std_ulogic_vector(unsigned(arbiter.addr) + 4); -- prefetch address of *next* linear access

      -- XIP access error: any XIP request while the arbiter is in direct-access state --
      arbiter.xip_acc_err <= '0';
      if (arbiter.state = S_DIRECT) then
        arbiter.xip_acc_err <= xip_req.stb;
      end if;

      -- pending flash access timeout: cleared while disabled or busy, otherwise counts up and saturates --
      if (ctrl(ctrl_enable_c) = '0') or (ctrl(ctrl_xip_enable_c) = '0') or (arbiter.state = S_BUSY) then -- sync reset
        arbiter.tmo_cnt <= (others => '0');
      elsif (arbiter.tmo_cnt(arbiter.tmo_cnt'left) = '0') then -- stop if maximum reached
        arbiter.tmo_cnt <= std_ulogic_vector(unsigned(arbiter.tmo_cnt) + 1);
      end if;
    end if;
  end process arbiter_sync;


  -- FSM - combinatorial part --
  arbiter_comb: process(arbiter, ctrl, xip_addr, phy_if, xip_req, spi_data_hi, spi_data_lo, spi_trigger)
  begin
    -- arbiter defaults --
    arbiter.state_nxt <= arbiter.state;

    -- bus interface defaults --
    xip_rsp.data <= (others => '0');
    xip_rsp.ack  <= '0';
    xip_rsp.err  <= arbiter.xip_acc_err;

    -- SPI PHY interface defaults --
    phy_if.start <= '0';
    phy_if.final <= arbiter.tmo_cnt(arbiter.tmo_cnt'left) or (not bool_to_ulogic_f(XIP_CACHE_EN)); -- terminate if timeout or if burst mode not enabled
    phy_if.wdata <= ctrl(ctrl_rd_cmd7_c downto ctrl_rd_cmd0_c) & xip_addr & x"00000000"; -- MSB-aligned: CMD + address + 32-bit zero data

    -- fsm --
    case arbiter.state is

      when S_DIRECT => -- XIP access disabled; direct SPI access
      -- ------------------------------------------------------------
        phy_if.wdata      <= spi_data_hi & spi_data_lo & x"00"; -- MSB-aligned data
        phy_if.start      <= spi_trigger;
        phy_if.final      <= '1'; -- do not keep CS active after transmission is done
        arbiter.state_nxt <= S_IDLE; -- only taken once the sync reset in arbiter_sync is released

      when S_IDLE => -- wait for new bus request
      -- ------------------------------------------------------------
        if (xip_req.stb = '1') then
          if (xip_req.rw = '0') then
            arbiter.state_nxt <= S_CHECK; -- read request
          else
            arbiter.state_nxt <= S_ERROR; -- write requests are rejected
          end if;
        end if;

      when S_CHECK => -- check if we can resume flash access
      -- ------------------------------------------------------------
        if (arbiter.addr(27 downto 2) = arbiter.addr_lookahead(27 downto 2)) and XIP_CACHE_EN and -- access to *next linear* address
           (arbiter.tmo_cnt(arbiter.tmo_cnt'left) = '0') then -- no "pending access" timeout yet
          phy_if.start      <= '1'; -- resume flash access
          arbiter.state_nxt <= S_BUSY;
        else
          phy_if.final      <= '1'; -- restart flash access
          arbiter.state_nxt <= S_TRIG;
        end if;

      when S_TRIG => -- trigger NEW flash read
      -- ------------------------------------------------------------
        phy_if.start      <= '1';
        arbiter.state_nxt <= S_BUSY;

      when S_BUSY => -- wait for PHY to complete operation
      -- ------------------------------------------------------------
        xip_rsp.data <= bswap32_f(phy_if.rdata); -- convert incrementing byte-read to little-endian
        if (phy_if.busy = '0') then
          xip_rsp.ack       <= '1';
          arbiter.state_nxt <= S_IDLE;
        end if;

      when S_ERROR => -- access error
      -- ------------------------------------------------------------
        xip_rsp.err       <= '1';
        arbiter.state_nxt <= S_IDLE;

      when others => -- undefined
      -- ------------------------------------------------------------
        arbiter.state_nxt <= S_IDLE;

    end case;
  end process arbiter_comb;

  -- arbiter status --
  arbiter.busy <= '1' when (arbiter.state = S_TRIG) or (arbiter.state = S_BUSY) else '0'; -- actual XIP access in progress


  -- SPI Clock Generator --------------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  clock_generator: process(rstn_i, clk_i)
  begin
    if (rstn_i = '0') then
      spi_clk_en <= '0';
      cdiv_cnt   <= (others => '0');
    elsif rising_edge(clk_i) then
      spi_clk_en <= '0'; -- default: single-cycle enable pulse
      if (ctrl(ctrl_enable_c) = '0') then -- reset/disabled
        cdiv_cnt <= (others => '0');
      elsif (clkgen_i(to_integer(unsigned(ctrl(ctrl_spi_prsc2_c downto ctrl_spi_prsc0_c)))) = '1') or
            (ctrl(ctrl_highspeed_c) = '1') then -- pre-scaled clock (or every cycle in high-speed mode)
        if (cdiv_cnt = ctrl(ctrl_cdiv3_c downto ctrl_cdiv0_c)) then -- clock divider for fine-tuning
          spi_clk_en <= '1';
          cdiv_cnt   <= (others => '0');
        else
          cdiv_cnt <= std_ulogic_vector(unsigned(cdiv_cnt) + 1);
        end if;
      end if;
    end if;
  end process clock_generator;

  -- enable clock generator --
  clkgen_en_o <= ctrl(ctrl_enable_c);


  -- SPI Physical Interface -----------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  neorv32_xip_phy_inst: neorv32_xip_phy
  port map (
    -- global control --
    rstn_i       => rstn_i,
    clk_i        => clk_i,
    spi_clk_en_i => spi_clk_en,
    -- operation configuration --
    cf_enable_i  => ctrl(ctrl_enable_c),   -- module enable (reset if low)
    cf_cpha_i    => ctrl(ctrl_spi_cpha_c), -- clock phase
    cf_cpol_i    => ctrl(ctrl_spi_cpol_c), -- clock idle polarity
    -- operation control --
    op_start_i   => phy_if.start,          -- trigger new transmission
    op_final_i   => phy_if.final,          -- end current transmission
    op_csen_i    => ctrl(ctrl_spi_csen_c), -- chip-select enable for the transmission
    op_busy_o    => phy_if.busy,           -- transmission in progress when set
    op_nbytes_i  => ctrl(ctrl_spi_nbytes3_c downto ctrl_spi_nbytes0_c), -- actual number of bytes to transmit
    op_wdata_i   => phy_if.wdata,          -- write data
    op_rdata_o   => phy_if.rdata,          -- read data
    -- SPI interface --
    spi_csn_o    => spi_csn_o,
    spi_clk_o    => spi_clk_o,
    spi_dat_i    => spi_dat_i,
    spi_dat_o    => spi_dat_o
  );

end neorv32_xip_rtl;


-- ############################################################################################################################
-- ############################################################################################################################


-- #################################################################################################
-- # << NEORV32 - XIP Module - SPI Physical Interface >>                                           #
-- # ********************************************************************************************* #
-- # BSD 3-Clause License                                                                          #
-- #                                                                                               #
-- # The NEORV32 RISC-V Processor, https://github.com/stnolting/neorv32                            #
-- # Copyright (c) 2024, Stephan Nolting. All rights reserved.                                     #
-- #                                                                                               #
-- # Redistribution and use in source and binary forms, with or without modification, are          #
-- # permitted provided that the following conditions are met:                                     #
-- #                                                                                               #
-- # 1. Redistributions of source code must retain the above copyright notice, this list of        #
-- #    conditions and the following disclaimer.                                                   #
-- #                                                                                               #
-- # 2. Redistributions in binary form must reproduce the above copyright notice, this list of     #
-- #    conditions and the following disclaimer in the documentation and/or other materials        #
-- #    provided with the distribution.                                                            #
-- #                                                                                               #
-- # 3. Neither the name of the copyright holder nor the names of its contributors may be used to  #
-- #    endorse or promote products derived from this software without specific prior written      #
-- #    permission.                                                                                #
-- #                                                                                               #
-- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS   #
-- # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF               #
-- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-- # IN NO EVENT SHALL THE                                                                         #
-- # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
-- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE #
-- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED    #
-- # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     #
-- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED  #
-- # OF THE POSSIBILITY OF SUCH DAMAGE.                                                            #
-- #################################################################################################

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library neorv32;
use neorv32.neorv32_package.all;

entity neorv32_xip_phy is
  port (
    -- global control --
    rstn_i       : in  std_ulogic; -- reset, async, low-active
    clk_i        : in  std_ulogic; -- clock
    spi_clk_en_i : in  std_ulogic; -- pre-scaled SPI clock-enable
    -- operation configuration --
    cf_enable_i  : in  std_ulogic; -- module enable (reset if low)
    cf_cpha_i    : in  std_ulogic; -- clock phase
    cf_cpol_i    : in  std_ulogic; -- clock idle polarity
    -- operation control --
    op_start_i   : in  std_ulogic; -- trigger new transmission
    op_final_i   : in  std_ulogic; -- end current transmission
    op_csen_i    : in  std_ulogic; -- chip-select enable for the transmission
    op_busy_o    : out std_ulogic; -- transmission in progress when set
    op_nbytes_i  : in  std_ulogic_vector(3 downto 0);  -- actual number of bytes to transmit (1..9)
    op_wdata_i   : in  std_ulogic_vector(71 downto 0); -- write data
    op_rdata_o   : out std_ulogic_vector(31 downto 0); -- read data
    -- SPI interface --
    spi_csn_o    : out std_ulogic;
    spi_clk_o    : out std_ulogic;
    spi_dat_i    : in  std_ulogic;
    spi_dat_o    : out std_ulogic
  );
end neorv32_xip_phy;

architecture neorv32_xip_phy_rtl of neorv32_xip_phy is

  -- serial engine --
  type ctrl_state_t is (S_IDLE, S_WAIT, S_START, S_SYNC, S_RTX_A, S_RTX_B, S_DONE);
  type ctrl_t is record
    state   : ctrl_state_t;
    sreg    : std_ulogic_vector(71 downto 0); -- only the lowest 32-bit are used as RX data
    bitcnt  : std_ulogic_vector(6 downto 0);  -- remaining bits of the current transfer
    di_sync : std_ulogic;                     -- sampled serial input bit
    csen    : std_ulogic;                     -- chip-select enable, latched while idle
  end record;
  signal ctrl : ctrl_t;

begin

  -- Serial Interface Engine ----------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Shifts ctrl.sreg out MSB-first on spi_dat_o and shifts the sampled input in at the LSB.
  -- All SPI-side progress is paced by spi_clk_en_i; one bit takes an S_RTX_A/S_RTX_B pair.
  serial_engine: process(rstn_i, clk_i)
  begin
    if (rstn_i = '0') then
      spi_clk_o    <= '0';
      spi_csn_o    <= '1';
      ctrl.state   <= S_IDLE;
      ctrl.csen    <= '0';
      ctrl.sreg    <= (others => '0');
      ctrl.bitcnt  <= (others => '0');
      ctrl.di_sync <= '0';
    elsif rising_edge(clk_i) then
      if (cf_enable_i = '0') then -- sync reset: same values as the async reset
        spi_clk_o    <= '0';
        spi_csn_o    <= '1';
        ctrl.state   <= S_IDLE;
        ctrl.csen    <= '0';
        ctrl.sreg    <= (others => '0');
        ctrl.bitcnt  <= (others => '0');
        ctrl.di_sync <= '0';
      else -- fsm (branches listed in transfer order)
        case ctrl.state is

          when S_IDLE => -- wait for new transmission trigger
          -- ------------------------------------------------------------
            spi_csn_o   <= '1'; -- flash disabled
            spi_clk_o   <= cf_cpol_i;
            ctrl.bitcnt <= op_nbytes_i & "000"; -- number of bytes (x8 = number of bits)
            ctrl.csen   <= op_csen_i;
            if (op_start_i = '1') then
              ctrl.state <= S_START;
            end if;

          when S_START => -- start of transmission (keep current spi_csn_o state!)
          -- ------------------------------------------------------------
            ctrl.sreg <= op_wdata_i;
            if (spi_clk_en_i = '1') then
              ctrl.state <= S_SYNC;
            end if;

          when S_SYNC => -- synchronize SPI clock
          -- ------------------------------------------------------------
            spi_csn_o <= not ctrl.csen; -- enable flash
            if (spi_clk_en_i = '1') then
              if (cf_cpha_i = '1') then -- clock phase shift
                spi_clk_o <= not cf_cpol_i;
              end if;
              ctrl.state <= S_RTX_A;
            end if;

          when S_RTX_A => -- first half of bit transmission
          -- ------------------------------------------------------------
            if (spi_clk_en_i = '1') then
              spi_clk_o    <= not (cf_cpha_i xor cf_cpol_i);
              ctrl.di_sync <= spi_dat_i; -- sample serial input
              ctrl.bitcnt  <= std_ulogic_vector(unsigned(ctrl.bitcnt) - 1);
              ctrl.state   <= S_RTX_B;
            end if;

          when S_RTX_B => -- second half of bit transmission
          -- ------------------------------------------------------------
            if (spi_clk_en_i = '1') then
              ctrl.sreg <= ctrl.sreg(ctrl.sreg'left-1 downto 0) & ctrl.di_sync; -- shift left, insert sampled bit
              if (or_reduce_f(ctrl.bitcnt) = '0') then -- all bits transferred?
                spi_clk_o  <= cf_cpol_i;
                ctrl.state <= S_DONE; -- transmission done
              else
                spi_clk_o  <= cf_cpha_i xor cf_cpol_i;
                ctrl.state <= S_RTX_A; -- next bit
              end if;
            end if;

          when S_DONE => -- transmission done
          -- ------------------------------------------------------------
            if (spi_clk_en_i = '1') then
              ctrl.state <= S_WAIT;
            end if;

          when S_WAIT => -- wait for resume transmission trigger
          -- ------------------------------------------------------------
            spi_csn_o   <= not ctrl.csen; -- keep CS active
            ctrl.bitcnt <= "0100000"; -- 4 bytes = 32-bit read data
            if (op_final_i = '1') then -- terminate pending flash access
              ctrl.state <= S_IDLE;
            elsif (op_start_i = '1') then -- resume flash access (shift register is not reloaded)
              ctrl.state <= S_SYNC;
            end if;

          when others => -- undefined
          -- ------------------------------------------------------------
            ctrl.state <= S_IDLE;

        end case;
      end if;
    end if;
  end process serial_engine;

  -- serial unit busy --
  op_busy_o <= '0' when (ctrl.state = S_IDLE) or (ctrl.state = S_WAIT) else '1';

  -- serial data output --
  spi_dat_o <= ctrl.sreg(ctrl.sreg'left);

  -- RX data --
  op_rdata_o <= ctrl.sreg(31 downto 0);

end neorv32_xip_phy_rtl;


-- ############################################################################################################################
-- ############################################################################################################################


-- #################################################################################################
-- # << NEORV32 - XIP Cache >>                                                                     #
-- # ********************************************************************************************* #
-- # Simple direct-mapped read-only cache to accelerate XIP (SPI) flash accesses.                  #
-- # ********************************************************************************************* #
-- # BSD 3-Clause License                                                                          #
-- #                                                                                               #
-- # The NEORV32 RISC-V Processor, https://github.com/stnolting/neorv32                            #
-- # Copyright (c) 2024, Stephan Nolting. All rights reserved.                                     #
-- #                                                                                               #
-- # Redistribution and use in source and binary forms, with or without modification, are          #
-- # permitted provided that the following conditions are met:                                     #
-- #                                                                                               #
-- # 1. Redistributions of source code must retain the above copyright notice, this list of        #
-- #    conditions and the following disclaimer.                                                   #
-- #                                                                                               #
-- # 2.
Redistributions in binary form must reproduce the above copyright notice, this list of # -- # conditions and the following disclaimer in the documentation and/or other materials # -- # provided with the distribution. # -- # # -- # 3. Neither the name of the copyright holder nor the names of its contributors may be used to # -- # endorse or promote products derived from this software without specific prior written # -- # permission. # -- # # -- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS # -- # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # -- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE # -- # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # -- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE # -- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED # -- # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # -- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED # -- # OF THE POSSIBILITY OF SUCH DAMAGE. 
-- #################################################################################################

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library neorv32;
use neorv32.neorv32_package.all;

-- -------------------------------------------------------------------------------------------------
-- XIP Cache
-- -------------------------------------------------------------------------------------------------
-- Read-only cache placed between a requesting host (cpu_req_i / cpu_rsp_o) and a downstream bus
-- (bus_req_o / bus_rsp_i). Every block index holds exactly one tag, one valid flag and
-- block_size_c/4 data words. Each data word is stored together with the bus ERR flag that was
-- returned while the word was downloaded, so a faulty word is reported as an error on every
-- later hit. Host write requests (and requests arriving while clear_i is set) are answered with
-- an error response.
-- -------------------------------------------------------------------------------------------------
entity neorv32_xip_cache is
  generic (
    CACHE_NUM_BLOCKS : natural range 1 to 256;  -- number of blocks (min 1), has to be a power of 2
    CACHE_BLOCK_SIZE : natural range 1 to 2**16 -- block size in bytes (min 4), has to be a power of 2
  );
  port (
    clk_i     : in  std_ulogic; -- global clock, rising edge
    rstn_i    : in  std_ulogic; -- global reset, low-active, async
    clear_i   : in  std_ulogic; -- cache clear (invalidates all blocks)
    cpu_req_i : in  bus_req_t;  -- host side: request bus
    cpu_rsp_o : out bus_rsp_t;  -- host side: response bus
    bus_req_o : out bus_req_t;  -- memory side: request bus (read-only)
    bus_rsp_i : in  bus_rsp_t   -- memory side: response bus
  );
end neorv32_xip_cache;

architecture neorv32_xip_cache_rtl of neorv32_xip_cache is

  -- auto configuration --
  -- generics that are not a power of two are replaced by 2**index_size_f(<generic>)
  constant block_num_c   : natural := cond_sel_natural_f(is_power_of_two_f(CACHE_NUM_BLOCKS), CACHE_NUM_BLOCKS, 2**index_size_f(CACHE_NUM_BLOCKS));
  constant block_size_c  : natural := cond_sel_natural_f(is_power_of_two_f(CACHE_BLOCK_SIZE), CACHE_BLOCK_SIZE, 2**index_size_f(CACHE_BLOCK_SIZE));
  constant offset_size_c : natural := index_size_f(block_size_c/4); -- offset addresses full 32-bit words

  -- cache layout --
  -- 32-bit address = tag & index & word offset & 2-bit byte offset
  constant index_size_c : natural := index_size_f(block_num_c);
  constant tag_size_c   : natural := 32 - (offset_size_c + index_size_c + 2); -- 2 additional bits for byte offset
  constant entries_c    : natural := block_num_c * (block_size_c/4); -- number of 32-bit entries (per set)

  -- cache interface --
  type cache_if_t is record
    host_rdata : std_ulogic_vector(31 downto 0); -- cpu read data
    host_rderr : std_ulogic; -- cpu read error (ERR flag stored with the data word)
    hit        : std_ulogic; -- hit access
    ctrl_en    : std_ulogic; -- control access enable (memories addressed by addr_reg instead of host address)
    ctrl_we    : std_ulogic; -- control write enable
  end record;
  signal cache : cache_if_t;

  -- control engine --
  type ctrl_engine_state_t is (S_IDLE, S_CHECK, S_DOWNLOAD_REQ, S_DOWNLOAD_GET, S_RESYNC, S_ERROR);
  signal state,    state_nxt    : ctrl_engine_state_t; -- FSM state
  signal addr_reg, addr_reg_nxt : std_ulogic_vector(31 downto 0); -- address register for block download

  -- cache memory --
  type tag_mem_t  is array (0 to block_num_c-1) of std_ulogic_vector(tag_size_c-1 downto 0);
  type data_mem_t is array (0 to entries_c-1)   of std_ulogic_vector(31+1 downto 0); -- data word + ERR status
  signal tag_mem   : tag_mem_t;
  signal data_mem  : data_mem_t;
  signal tag_rd    : std_ulogic_vector(tag_size_c-1 downto 0); -- tag read data
  signal data_rd   : std_ulogic_vector(31+1 downto 0); -- data word + ERR status
  signal valid_mem : std_ulogic_vector(block_num_c-1 downto 0); -- one valid flag per block
  signal valid_rd  : std_ulogic; -- valid flag read data

  -- access address decomposition --
  type acc_addr_t is record
    tag    : std_ulogic_vector(tag_size_c-1 downto 0);
    index  : std_ulogic_vector(index_size_c-1 downto 0);
    offset : std_ulogic_vector(offset_size_c-1 downto 0);
  end record;
  signal host_acc, ctrl_acc : acc_addr_t; -- host_acc: from cpu_req_i.addr, ctrl_acc: from addr_reg

  -- cache data memory access --
  signal cache_index  : std_ulogic_vector(index_size_c-1 downto 0);
  signal cache_offset : std_ulogic_vector(offset_size_c-1 downto 0);
  signal cache_addr   : std_ulogic_vector((index_size_c+offset_size_c)-1 downto 0); -- index & offset

begin

  -- Sanity Checks --------------------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- A block has to hold at least one 32-bit word. For a block size below 4 bytes the term
  -- block_size_c/4 evaluates to zero, so entries_c is zero and data_mem_t degenerates into a
  -- null array (0 to -1); every data memory access would then be out of range. The generic's
  -- declared range (1 to 2**16) does not prevent this, so report it explicitly.
  sanity_check_block_size:
  assert (CACHE_BLOCK_SIZE >= 4)
    report "[NEORV32] XIP cache: CACHE_BLOCK_SIZE has to be at least 4 bytes (one 32-bit word)."
    severity error;


  -- Control Engine FSM Sync ----------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- State and download address registers; asynchronously cleared by the low-active reset.
  ctrl_engine_fsm_sync: process(rstn_i, clk_i)
  begin
    if (rstn_i = '0') then
      state    <= S_IDLE;
      addr_reg <= (others => '0');
    elsif rising_edge(clk_i) then
      state    <= state_nxt;
      addr_reg <= addr_reg_nxt;
    end if;
  end process ctrl_engine_fsm_sync;


  -- Control Engine FSM Comb ----------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Next-state logic plus generation of the host response and the downstream bus request.
  --   S_IDLE         : wait for a host request; writes / requests during clear go to S_ERROR
  --   S_CHECK        : evaluate the registered tag/valid/data read; answer on hit, download on miss
  --   S_DOWNLOAD_REQ : issue a single read request for the word addressed by addr_reg
  --   S_DOWNLOAD_GET : wait for ACK/ERR, store the word (plus ERR flag), advance or finish the block
  --   S_RESYNC       : dummy cycle so the memories are read with the host address again
  --   S_ERROR        : answer the host with an error response
  ctrl_engine_fsm_comb: process(state, addr_reg, cache, clear_i, cpu_req_i, bus_rsp_i)
  begin
    -- control defaults --
    state_nxt    <= state;
    addr_reg_nxt <= addr_reg;

    -- cache defaults --
    cache.ctrl_en <= '0';
    cache.ctrl_we <= '0';

    -- host response defaults --
    cpu_rsp_o.ack  <= '0';
    cpu_rsp_o.err  <= '0';
    cpu_rsp_o.data <= (others => '0');

    -- bus interface defaults --
    bus_req_o.data  <= (others => '0');
    bus_req_o.ben   <= (others => '0');
    bus_req_o.src   <= cpu_req_i.src;
    bus_req_o.priv  <= cpu_req_i.priv;
    bus_req_o.addr  <= addr_reg;
    bus_req_o.rw    <= '0'; -- read-only
    bus_req_o.stb   <= '0';
    bus_req_o.rvso  <= cpu_req_i.rvso;
    bus_req_o.fence <= cpu_req_i.fence;

    -- fsm --
    case state is

      when S_IDLE => -- wait for host access request or cache control operation
      -- ------------------------------------------------------------
        if (cpu_req_i.stb = '1') then
          if (cpu_req_i.rw = '1') or (clear_i = '1') then -- write access or cache being cleared
            state_nxt <= S_ERROR;
          else -- actual cache access
            state_nxt <= S_CHECK;
          end if;
        end if;

      when S_CHECK => -- finalize host access if cache hit
      -- ------------------------------------------------------------
        -- NOTE(review): the hit check compares the current cpu_req_i.addr tag against memory
        -- outputs registered in the previous cycle, so the host address presumably has to stay
        -- stable until the response is issued - confirm against the bus protocol.
        -- calculate block base address in case we need to download it --
        addr_reg_nxt <= cpu_req_i.addr;
        addr_reg_nxt((offset_size_c+2)-1 downto 0) <= (others => '0'); -- block-aligned (overrides the low bits assigned above)
        --
        cpu_rsp_o.data <= cache.host_rdata; -- output read data in case we have a hit
        if (cache.hit = '1') then -- cache HIT
          cpu_rsp_o.err <= cache.host_rderr; -- replay the ERR flag stored with this word
          cpu_rsp_o.ack <= not cache.host_rderr;
          state_nxt     <= S_IDLE;
        else -- cache MISS
          state_nxt <= S_DOWNLOAD_REQ;
        end if;

      when S_DOWNLOAD_REQ => -- download new cache block: request new word
      -- ------------------------------------------------------------
        bus_req_o.stb <= '1'; -- request new read transfer
        state_nxt     <= S_DOWNLOAD_GET;

      when S_DOWNLOAD_GET => -- download new cache block: wait for bus response
      -- ------------------------------------------------------------
        cache.ctrl_en <= '1'; -- cache update operation
        if (bus_rsp_i.ack = '1') or (bus_rsp_i.err = '1') then -- ACK or ERROR = write to cache and get next word (store ERROR flag in cache)
          cache.ctrl_we <= '1'; -- write to cache
          if (and_reduce_f(addr_reg((offset_size_c+2)-1 downto 2)) = '1') then -- block complete? (word offset is all-ones)
            state_nxt <= S_RESYNC;
          else -- get next word
            addr_reg_nxt <= std_ulogic_vector(unsigned(addr_reg) + 4);
            state_nxt    <= S_DOWNLOAD_REQ;
          end if;
        end if;

      when S_RESYNC => -- re-sync host/cache access: cache read-latency dummy cycle
      -- ------------------------------------------------------------
        state_nxt <= S_CHECK;

      when others => -- S_ERROR: error
      -- ------------------------------------------------------------
        cpu_rsp_o.err <= '1';
        state_nxt     <= S_IDLE;

    end case;
  end process ctrl_engine_fsm_comb;


  -- Access Address Decomposition -----------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- host access: derived from the current host request address
  host_acc.tag    <= cpu_req_i.addr(31 downto 31-(tag_size_c-1));
  host_acc.index  <= cpu_req_i.addr(31-tag_size_c downto 2+offset_size_c);
  host_acc.offset <= cpu_req_i.addr(2+(offset_size_c-1) downto 2); -- discard byte offset

  -- control access: derived from the block download address register
  ctrl_acc.tag    <= addr_reg(31 downto 31-(tag_size_c-1));
  ctrl_acc.index  <= addr_reg(31-tag_size_c downto 2+offset_size_c);
  ctrl_acc.offset <= addr_reg(2+(offset_size_c-1) downto 2); -- discard byte offset


  -- Status Flag Memory ---------------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- One valid flag per block. clear_i has priority over setting a flag; a block is flagged
  -- valid with every word written to it (i.e. already with the first downloaded word).
  status_memory: process(rstn_i, clk_i) -- single-port RAM
  begin
    if (rstn_i = '0') then
      valid_mem <= (others => '0');
      valid_rd  <= '0';
    elsif rising_edge(clk_i) then
      if (clear_i = '1') then -- invalidate cache
        valid_mem <= (others => '0');
      elsif (cache.ctrl_we = '1') then -- make current block valid
        valid_mem(to_integer(unsigned(cache_index))) <= '1';
      end if;
      valid_rd <= valid_mem(to_integer(unsigned(cache_index))); -- registered read
    end if;
  end process status_memory;


  -- Cache Data Memory ----------------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Tag and data memories share one write enable: every downloaded word also (re-)writes the
  -- tag of its block. Read outputs are registered (one cycle read latency).
  cache_memory: process(clk_i) -- single-port RAM
  begin
    if rising_edge(clk_i) then -- no reset to allow mapping to blockRAM
      if (cache.ctrl_we = '1') then -- update cache block
        data_mem(to_integer(unsigned(cache_addr))) <= bus_rsp_i.err & bus_rsp_i.data; -- bit 32 = ERR flag
        tag_mem(to_integer(unsigned(cache_index))) <= ctrl_acc.tag;
      end if;
      data_rd <= data_mem(to_integer(unsigned(cache_addr)));
      tag_rd  <= tag_mem(to_integer(unsigned(cache_index)));
    end if;
  end process cache_memory;

  -- cache access select --
  -- host address while the control engine is passive, download address during cache updates
  cache_index  <= host_acc.index  when (cache.ctrl_en = '0') else ctrl_acc.index;
  cache_offset <= host_acc.offset when (cache.ctrl_en = '0') else ctrl_acc.offset;
  cache_addr   <= cache_index & cache_offset; -- resulting ram access address

  -- hit = tag match and valid entry --
  cache.hit <= '1' when (host_acc.tag = tag_rd) and (valid_rd = '1') else '0';

  -- data output --
  cache.host_rdata <= data_rd(31 downto 0);
  cache.host_rderr <= data_rd(32);

end neorv32_xip_cache_rtl;