-- neorv32/rtl/core/neorv32_xip.vhd
--
-- 950 lines
-- 46 KiB
-- VHDL

-- #################################################################################################
-- # << NEORV32 - Execute In-Place (XIP) Module >> #
-- # ********************************************************************************************* #
-- # This module allows the CPU to execute code (and read constant data) directly from an SPI #
-- # flash memory. Two host ports are implemented: one for accessing the control and status #
-- # registers (mapped to the processor's IO space) and one for the actual instruction/data fetch. #
-- # The actual address space mapping of the "instruction/data interface" is done by programming #
-- # special control register bits. #
-- # ********************************************************************************************* #
-- # BSD 3-Clause License #
-- # #
-- # The NEORV32 RISC-V Processor, https://github.com/stnolting/neorv32 #
-- # Copyright (c) 2024, Stephan Nolting. All rights reserved. #
-- # #
-- # Redistribution and use in source and binary forms, with or without modification, are #
-- # permitted provided that the following conditions are met: #
-- # #
-- # 1. Redistributions of source code must retain the above copyright notice, this list of #
-- # conditions and the following disclaimer. #
-- # #
-- # 2. Redistributions in binary form must reproduce the above copyright notice, this list of #
-- # conditions and the following disclaimer in the documentation and/or other materials #
-- # provided with the distribution. #
-- # #
-- # 3. Neither the name of the copyright holder nor the names of its contributors may be used to #
-- # endorse or promote products derived from this software without specific prior written #
-- # permission. #
-- # #
-- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS #
-- # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF #
-- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE #
-- # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
-- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE #
-- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED #
-- # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
-- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED #
-- # OF THE POSSIBILITY OF SUCH DAMAGE. #
-- #################################################################################################
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
library neorv32;
use neorv32.neorv32_package.all;
-- Top-level XIP entity. Exposes two host ports (control/status registers and the
-- instruction/data fetch port), the clock-generator hookup and the four SPI pins.
entity neorv32_xip is
generic (
XIP_CACHE_EN : boolean; -- implement XIP cache? (also enables the arbiter's burst/resume handling)
XIP_CACHE_NUM_BLOCKS : natural range 1 to 256; -- number of blocks (min 1), has to be a power of 2
-- NOTE(review): the range below admits 1..3 although the comment asks for min 4; nothing in this entity enforces it
XIP_CACHE_BLOCK_SIZE : natural range 1 to 2**16 -- block size in bytes (min 4), has to be a power of 2
);
port (
clk_i : in std_ulogic; -- global clock line
rstn_i : in std_ulogic; -- global reset line, low-active
bus_req_i : in bus_req_t; -- bus request (control/status register access)
bus_rsp_o : out bus_rsp_t; -- bus response (control/status register access)
xip_req_i : in bus_req_t; -- XIP request (instruction/data fetch from flash)
xip_rsp_o : out bus_rsp_t; -- XIP response (instruction/data fetch from flash)
clkgen_en_o : out std_ulogic; -- enable clock generator (follows the module-enable control bit)
clkgen_i : in std_ulogic_vector(7 downto 0); -- pre-scaled clock ticks; one bit is selected by the control register's prescaler field
spi_csn_o : out std_ulogic; -- chip-select, low-active
spi_clk_o : out std_ulogic; -- serial clock
spi_dat_i : in std_ulogic; -- device data output
spi_dat_o : out std_ulogic -- controller data output
);
end neorv32_xip;
architecture neorv32_xip_rtl of neorv32_xip is
-- control register --
-- Bit positions inside the 32-bit control/status word. Bits 0..26 are stored in
-- signal 'ctrl'; bits 29..31 are read-only status flags composed on read access.
constant ctrl_enable_c : natural := 0; -- r/w: module enable
constant ctrl_spi_prsc0_c : natural := 1; -- r/w: SPI clock prescaler select - bit 0
constant ctrl_spi_prsc1_c : natural := 2; -- r/w: SPI clock prescaler select - bit 1
constant ctrl_spi_prsc2_c : natural := 3; -- r/w: SPI clock prescaler select - bit 2
constant ctrl_spi_cpol_c : natural := 4; -- r/w: SPI (idle) clock polarity
constant ctrl_spi_cpha_c : natural := 5; -- r/w: SPI clock phase
constant ctrl_spi_nbytes0_c : natural := 6; -- r/w: SPI number of bytes in transmission (1..9) - bit 0
constant ctrl_spi_nbytes3_c : natural := 9; -- r/w: SPI number of bytes in transmission (1..9) - bit 3
constant ctrl_xip_enable_c : natural := 10; -- r/w: XIP access mode enable
constant ctrl_xip_abytes0_c : natural := 11; -- r/w: XIP number of address bytes (0=1,1=2,2=3,3=4) - bit 0
constant ctrl_xip_abytes1_c : natural := 12; -- r/w: XIP number of address bytes (0=1,1=2,2=3,3=4) - bit 1
constant ctrl_rd_cmd0_c : natural := 13; -- r/w: SPI flash read command - bit 0
constant ctrl_rd_cmd7_c : natural := 20; -- r/w: SPI flash read command - bit 7
constant ctrl_spi_csen_c : natural := 21; -- r/w: SPI chip-select enabled
constant ctrl_highspeed_c : natural := 22; -- r/w: SPI high-speed mode enable (ignoring ctrl_spi_prsc)
constant ctrl_cdiv0_c : natural := 23; -- r/w: clock divider bit 0 (SPI clock-enable fires when the divider counter equals this 4-bit value)
constant ctrl_cdiv1_c : natural := 24; -- r/w: clock divider bit 1
constant ctrl_cdiv2_c : natural := 25; -- r/w: clock divider bit 2
constant ctrl_cdiv3_c : natural := 26; -- r/w: clock divider bit 3
--
constant ctrl_burst_en_c : natural := 29; -- r/-: XIP burst mode enable (when cache is implemented); reads back the XIP_CACHE_EN generic
constant ctrl_phy_busy_c : natural := 30; -- r/-: SPI PHY is busy when set
constant ctrl_xip_busy_c : natural := 31; -- r/-: XIP access in progress
--
signal ctrl : std_ulogic_vector(26 downto 0); -- writable part of the control register (bits 0..26)
-- Direct SPI access registers --
signal spi_data_lo : std_ulogic_vector(31 downto 0); -- written via bus; the read path of this address returns the PHY's RX data instead
signal spi_data_hi : std_ulogic_vector(31 downto 0); -- write-only!
signal spi_trigger : std_ulogic; -- trigger direct SPI operation (single-cycle pulse on write to the data-hi register)
-- XIP access address --
signal xip_addr : std_ulogic_vector(31 downto 0); -- flash address, MSB-aligned according to the configured number of address bytes
-- SPI access fetch arbiter --
-- S_DIRECT: XIP mode off, PHY driven by the direct-access registers
-- S_IDLE/S_CHECK/S_TRIG/S_BUSY: XIP read sequence, S_ERROR: rejected (write) access
type arbiter_state_t is (S_DIRECT, S_IDLE, S_CHECK, S_TRIG, S_BUSY, S_ERROR);
type arbiter_t is record
state : arbiter_state_t; -- current FSM state (registered)
state_nxt : arbiter_state_t; -- next FSM state (combinational)
addr : std_ulogic_vector(31 downto 0); -- buffered address of the latest XIP read request
addr_lookahead : std_ulogic_vector(31 downto 0); -- registered 'addr + 4', i.e. the next linear word address
xip_acc_err : std_ulogic; -- XIP request seen while the arbiter is in S_DIRECT
busy : std_ulogic; -- set while in S_TRIG or S_BUSY
tmo_cnt : std_ulogic_vector(2 downto 0); -- timeout counter for auto CS de-assert (burst mode only)
end record;
signal arbiter : arbiter_t;
-- cache access --
signal cache_clear : std_ulogic; -- only driven when the cache is implemented
signal xip_req : bus_req_t; -- request towards the arbiter (from cache, or directly from xip_req_i)
signal xip_rsp : bus_rsp_t; -- response from the arbiter
-- Clock generator --
signal cdiv_cnt : std_ulogic_vector(3 downto 0); -- fine-tuning divider counter
signal spi_clk_en : std_ulogic; -- single-cycle SPI clock-enable pulse for the PHY
-- Component: XIP cache --
-- Declaration of the cache that sits between the XIP host port and the access
-- arbiter; it is only instantiated when XIP_CACHE_EN is true.
component neorv32_xip_cache
generic (
CACHE_NUM_BLOCKS : natural range 1 to 256; -- number of blocks (min 1), has to be a power of 2
CACHE_BLOCK_SIZE : natural range 1 to 2**16 -- block size in bytes (min 4), has to be a power of 2
);
port (
clk_i : in std_ulogic; -- global clock, rising edge
rstn_i : in std_ulogic; -- global reset, low-active, async
clear_i : in std_ulogic; -- cache clear
cpu_req_i : in bus_req_t; -- request bus (host side)
cpu_rsp_o : out bus_rsp_t; -- response bus (host side)
bus_req_o : out bus_req_t; -- request bus (towards the access arbiter)
bus_rsp_i : in bus_rsp_t -- response bus (from the access arbiter)
);
end component;
-- Component: SPI PHY --
-- Declaration of the serial engine that shifts up to 72 bits out and returns
-- the last 32 received bits.
component neorv32_xip_phy
port (
-- global control --
rstn_i : in std_ulogic; -- reset, async, low-active
clk_i : in std_ulogic; -- clock
spi_clk_en_i : in std_ulogic; -- pre-scaled SPI clock-enable
-- operation configuration --
cf_enable_i : in std_ulogic; -- module enable (reset if low)
cf_cpha_i : in std_ulogic; -- clock phase
cf_cpol_i : in std_ulogic; -- clock idle polarity
-- operation control --
op_start_i : in std_ulogic; -- trigger new transmission
op_final_i : in std_ulogic; -- end current transmission
op_csen_i : in std_ulogic; -- actually enabled device for transmission
op_busy_o : out std_ulogic; -- transmission in progress when set
op_nbytes_i : in std_ulogic_vector(3 downto 0); -- actual number of bytes to transmit (1..9)
op_wdata_i : in std_ulogic_vector(71 downto 0); -- write data
op_rdata_o : out std_ulogic_vector(31 downto 0); -- read data
-- SPI interface --
spi_csn_o : out std_ulogic; -- chip-select, low-active
spi_clk_o : out std_ulogic; -- serial clock
spi_dat_i : in std_ulogic; -- device data output
spi_dat_o : out std_ulogic -- controller data output
);
end component;
-- SPI PHY interface --
-- Bundle of the handshake/data signals exchanged between the arbiter and the PHY.
type phy_if_t is record
start : std_ulogic; -- trigger new transmission
final : std_ulogic; -- stop current transmission
busy : std_ulogic; -- transmission in progress when set
wdata : std_ulogic_vector(71 downto 0); -- write data
rdata : std_ulogic_vector(31 downto 0); -- read data
end record;
signal phy_if : phy_if_t;
begin
-- Control Bus Access ---------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
-- Memory-mapped register interface. Address bits (3 downto 2) select the register:
--   "00" control/status, "10" SPI data low, "11" SPI data high (write-only, starts a
--   direct SPI transfer). Every strobe is acknowledged in the following cycle and
--   this port never signals a bus error.
ctrl_bus_access : process(rstn_i, clk_i)
begin
  if (rstn_i = '0') then
    spi_trigger    <= '0';
    spi_data_hi    <= (others => '0');
    spi_data_lo    <= (others => '0');
    ctrl           <= (others => '0');
    bus_rsp_o.data <= (others => '0');
    bus_rsp_o.err  <= '0';
    bus_rsp_o.ack  <= '0';
  elsif rising_edge(clk_i) then
    -- per-cycle defaults: zero read data, no error, single-cycle trigger pulse --
    spi_trigger    <= '0';
    bus_rsp_o.data <= (others => '0');
    bus_rsp_o.err  <= '0';
    bus_rsp_o.ack  <= bus_req_i.stb;

    if (bus_req_i.stb = '1') then
      if (bus_req_i.rw = '1') then
        -- write access --
        case bus_req_i.addr(3 downto 2) is
          when "00" => -- control register: the writable fields occupy bits 0..26 without gaps
            ctrl <= bus_req_i.data(ctrl'range);
          when "10" => -- SPI direct data access register lo
            spi_data_lo <= bus_req_i.data;
          when "11" => -- SPI direct data access register hi
            spi_data_hi <= bus_req_i.data;
            spi_trigger <= '1'; -- trigger direct SPI transaction
          when others => -- no writable register at this address
            null;
        end case;
      else
        -- read access --
        case bus_req_i.addr(3 downto 2) is
          when "00" => -- control register: stored fields plus read-only status flags
            bus_rsp_o.data(ctrl'range)      <= ctrl;
            bus_rsp_o.data(ctrl_burst_en_c) <= bool_to_ulogic_f(XIP_CACHE_EN);
            bus_rsp_o.data(ctrl_phy_busy_c) <= phy_if.busy;
            bus_rsp_o.data(ctrl_xip_busy_c) <= arbiter.busy;
          when "10" => -- SPI direct data access register lo: returns the PHY's receive data
            bus_rsp_o.data <= phy_if.rdata;
          when others => -- unavailable (not implemented or write-only): keep the all-zero default
            null;
        end case;
      end if;
    end if;
  end if;
end process ctrl_bus_access;
-- XIP Cache ------------------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
-- cache not implemented: connect the XIP host port straight to the access arbiter --
neorv32_xip_cache_inst_false:
if not XIP_CACHE_EN generate
  xip_rsp_o <= xip_rsp;
  xip_req   <= xip_req_i;
end generate;

-- cache implemented: insert it between the XIP host port and the access arbiter --
neorv32_xip_cache_inst_true:
if XIP_CACHE_EN generate

  -- the cache is released only while the module AND XIP mode are enabled and no FENCE is requested --
  cache_clear <= '0' when (ctrl(ctrl_enable_c) /= '0') and (ctrl(ctrl_xip_enable_c) /= '0') and (xip_req_i.fence /= '1') else '1';

  neorv32_xip_cache_inst: neorv32_xip_cache
  generic map (
    CACHE_BLOCK_SIZE => XIP_CACHE_BLOCK_SIZE,
    CACHE_NUM_BLOCKS => XIP_CACHE_NUM_BLOCKS
  )
  port map (
    -- global control --
    rstn_i    => rstn_i,
    clk_i     => clk_i,
    clear_i   => cache_clear,
    -- host side --
    cpu_req_i => xip_req_i,
    cpu_rsp_o => xip_rsp_o,
    -- arbiter side --
    bus_req_o => xip_req,
    bus_rsp_i => xip_rsp
  );

end generate;
-- XIP Address Computation Logic ----------------------------------------------------------
-- -------------------------------------------------------------------------------------------
-- Builds the flash address from the buffered request address: only bits 27..2 are
-- used (word-aligned, upper four bits cleared), then the relevant low-order bytes
-- are moved to the top of 'xip_addr' so they are shifted out first.
xip_access_logic: process(arbiter.addr, ctrl)
  variable word_addr_v : std_ulogic_vector(31 downto 0);
begin
  -- bits 31..28 and 1..0 stay zero; sub-word read accesses are handled by the CPU logic --
  word_addr_v := (others => '0');
  word_addr_v(27 downto 2) := arbiter.addr(27 downto 2);

  -- MSB-align the configured number of address bytes; unused low-order bits remain zero --
  xip_addr <= (others => '0');
  case ctrl(ctrl_xip_abytes1_c downto ctrl_xip_abytes0_c) is
    when "00" => -- 1 address byte
      xip_addr(31 downto 24) <= word_addr_v(7 downto 0);
    when "01" => -- 2 address bytes
      xip_addr(31 downto 16) <= word_addr_v(15 downto 0);
    when "10" => -- 3 address bytes
      xip_addr(31 downto 8) <= word_addr_v(23 downto 0);
    when others => -- 4 address bytes
      xip_addr <= word_addr_v;
  end case;
end process xip_access_logic;
-- SPI Access Arbiter ---------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
-- Registered part of the access arbiter: FSM state, buffered request address with
-- its +4 look-ahead, the "XIP access while in direct mode" error flag and the
-- timeout counter that limits how long a flash access is kept pending.
arbiter_sync: process(rstn_i, clk_i)
begin
  if (rstn_i = '0') then
    arbiter.tmo_cnt        <= (others => '0');
    arbiter.xip_acc_err    <= '0';
    arbiter.addr_lookahead <= (others => '0');
    arbiter.addr           <= (others => '0');
    arbiter.state          <= S_DIRECT;
  elsif rising_edge(clk_i) then

    -- buffer the address of every read request (reducing fan-out on CPU's address net) --
    if (xip_req.stb = '1') and (xip_req.rw = '0') then
      arbiter.addr <= xip_req.addr;
    end if;

    -- address of the *next* linear word, derived from the currently buffered address --
    arbiter.addr_lookahead <= std_ulogic_vector(unsigned(arbiter.addr) + 4);

    -- any XIP request that arrives while in direct-SPI mode is flagged as error --
    arbiter.xip_acc_err <= '0';
    if (arbiter.state = S_DIRECT) then
      arbiter.xip_acc_err <= xip_req.stb;
    end if;

    -- pending flash access timeout: restart on every busy phase, saturate once the MSB is set --
    if (arbiter.state = S_BUSY) then
      arbiter.tmo_cnt <= (others => '0');
    elsif (arbiter.tmo_cnt(arbiter.tmo_cnt'left) = '0') then
      arbiter.tmo_cnt <= std_ulogic_vector(unsigned(arbiter.tmo_cnt) + 1);
    end if;

    -- regular state update --
    arbiter.state <= arbiter.state_nxt;

    -- sync reset: module or XIP mode disabled overrides state and timeout counter --
    if (ctrl(ctrl_enable_c) = '0') or (ctrl(ctrl_xip_enable_c) = '0') then
      arbiter.state   <= S_DIRECT;
      arbiter.tmo_cnt <= (others => '0');
    end if;

  end if;
end process arbiter_sync;
-- FSM - combinatorial part --
-- Computes the next arbiter state, the XIP bus response and the PHY control
-- signals from the current state. All outputs get a default first; the state
-- branches below only override what differs from that default.
arbiter_comb: process(arbiter, ctrl, xip_addr, phy_if, xip_req, spi_data_hi, spi_data_lo, spi_trigger)
begin
-- arbiter defaults --
arbiter.state_nxt <= arbiter.state; -- hold state unless a branch says otherwise
-- bus interface defaults --
xip_rsp.data <= (others => '0');
xip_rsp.ack <= '0';
xip_rsp.err <= arbiter.xip_acc_err; -- registered flag: XIP request was seen while in S_DIRECT
-- SPI PHY interface defaults --
phy_if.start <= '0';
phy_if.final <= arbiter.tmo_cnt(arbiter.tmo_cnt'left) or (not bool_to_ulogic_f(XIP_CACHE_EN)); -- terminate if timeout or if burst mode not enabled
phy_if.wdata <= ctrl(ctrl_rd_cmd7_c downto ctrl_rd_cmd0_c) & xip_addr & x"00000000"; -- MSB-aligned: CMD + address + 32-bit zero data
-- fsm --
case arbiter.state is
when S_DIRECT => -- XIP access disabled; direct SPI access
-- ------------------------------------------------------------
phy_if.wdata <= spi_data_hi & spi_data_lo & x"00"; -- MSB-aligned data
phy_if.start <= spi_trigger; -- pulse generated by a write to the data-hi register
phy_if.final <= '1'; -- do not keep CS active after transmission is done
arbiter.state_nxt <= S_IDLE; -- only taken once arbiter_sync stops forcing S_DIRECT (module and XIP mode enabled)
when S_IDLE => -- wait for new bus request
-- ------------------------------------------------------------
if (xip_req.stb = '1') then
if (xip_req.rw = '0') then -- read request: go check whether the flash access can be resumed
arbiter.state_nxt <= S_CHECK;
else -- write request: answered with a bus error
arbiter.state_nxt <= S_ERROR;
end if;
end if;
when S_CHECK => -- check if we can resume flash access
-- ------------------------------------------------------------
-- at this point arbiter.addr holds the new request address while addr_lookahead
-- still holds "previous address + 4" (both are registered in arbiter_sync)
if (arbiter.addr(27 downto 2) = arbiter.addr_lookahead(27 downto 2)) and XIP_CACHE_EN and -- access to *next linear* address
(arbiter.tmo_cnt(arbiter.tmo_cnt'left) = '0') then -- no "pending access" timeout yet
phy_if.start <= '1'; -- resume flash access
arbiter.state_nxt <= S_BUSY;
else
phy_if.final <= '1'; -- restart flash access
arbiter.state_nxt <= S_TRIG;
end if;
when S_TRIG => -- trigger NEW flash read
-- ------------------------------------------------------------
phy_if.start <= '1';
arbiter.state_nxt <= S_BUSY;
when S_BUSY => -- wait for PHY to complete operation
-- ------------------------------------------------------------
xip_rsp.data <= bswap32_f(phy_if.rdata); -- convert incrementing byte-read to little-endian
if (phy_if.busy = '0') then -- PHY done: acknowledge the request with the data driven above
xip_rsp.ack <= '1';
arbiter.state_nxt <= S_IDLE;
end if;
when S_ERROR => -- access error
-- ------------------------------------------------------------
xip_rsp.err <= '1'; -- single-cycle error response, then back to idle
arbiter.state_nxt <= S_IDLE;
when others => -- undefined
-- ------------------------------------------------------------
arbiter.state_nxt <= S_IDLE;
end case;
end process arbiter_comb;
-- arbiter status --
arbiter.busy <= '1' when (arbiter.state = S_TRIG) or (arbiter.state = S_BUSY) else '0'; -- actual XIP access in progress
-- SPI Clock Generator --------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
-- Produces the single-cycle SPI clock-enable pulse for the PHY. A "tick" is either the
-- selected pre-scaler bit of clkgen_i or, in high-speed mode, every clock cycle. The
-- pulse is issued on the tick at which the divider counter matches the configured
-- divider value; the counter then restarts from zero.
clock_generator: process(rstn_i, clk_i)
begin
  if (rstn_i = '0') then
    cdiv_cnt   <= (others => '0');
    spi_clk_en <= '0';
  elsif rising_edge(clk_i) then
    spi_clk_en <= '0'; -- default: no pulse
    if (ctrl(ctrl_enable_c) /= '0') then
      -- module enabled: advance the divider on every pre-scaled tick --
      if (clkgen_i(to_integer(unsigned(ctrl(ctrl_spi_prsc2_c downto ctrl_spi_prsc0_c)))) = '1') or
         (ctrl(ctrl_highspeed_c) = '1') then
        if (cdiv_cnt /= ctrl(ctrl_cdiv3_c downto ctrl_cdiv0_c)) then -- still dividing
          cdiv_cnt <= std_ulogic_vector(unsigned(cdiv_cnt) + 1);
        else -- divider value reached: emit pulse and restart
          cdiv_cnt   <= (others => '0');
          spi_clk_en <= '1';
        end if;
      end if;
    else
      -- reset/disabled: hold the divider at zero --
      cdiv_cnt <= (others => '0');
    end if;
  end if;
end process clock_generator;

-- the external clock generator is requested for as long as the module is enabled --
clkgen_en_o <= ctrl(ctrl_enable_c);
-- SPI Physical Interface -----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
-- Serial engine instance; configuration comes straight from the control register,
-- the transfer handshake and payload from the access arbiter.
neorv32_xip_phy_inst: neorv32_xip_phy
port map (
  -- clocking and reset --
  clk_i        => clk_i,
  rstn_i       => rstn_i,
  spi_clk_en_i => spi_clk_en,
  -- static configuration (control register) --
  cf_enable_i  => ctrl(ctrl_enable_c),   -- module enable (reset if low)
  cf_cpol_i    => ctrl(ctrl_spi_cpol_c), -- clock idle polarity
  cf_cpha_i    => ctrl(ctrl_spi_cpha_c), -- clock phase
  -- transfer handshake (arbiter) --
  op_start_i   => phy_if.start, -- trigger new transmission
  op_final_i   => phy_if.final, -- end current transmission
  op_busy_o    => phy_if.busy,  -- transmission in progress when set
  -- transfer parameters and payload --
  op_csen_i    => ctrl(ctrl_spi_csen_c), -- actually enabled device for transmission
  op_nbytes_i  => ctrl(ctrl_spi_nbytes3_c downto ctrl_spi_nbytes0_c), -- actual number of bytes to transmit
  op_wdata_i   => phy_if.wdata, -- write data
  op_rdata_o   => phy_if.rdata, -- read data
  -- SPI pins --
  spi_clk_o    => spi_clk_o,
  spi_csn_o    => spi_csn_o,
  spi_dat_o    => spi_dat_o,
  spi_dat_i    => spi_dat_i
);
end neorv32_xip_rtl;
-- ############################################################################################################################
-- ############################################################################################################################
-- #################################################################################################
-- # << NEORV32 - XIP Module - SPI Physical Interface >> #
-- # ********************************************************************************************* #
-- # BSD 3-Clause License #
-- # #
-- # The NEORV32 RISC-V Processor, https://github.com/stnolting/neorv32 #
-- # Copyright (c) 2024, Stephan Nolting. All rights reserved. #
-- # #
-- # Redistribution and use in source and binary forms, with or without modification, are #
-- # permitted provided that the following conditions are met: #
-- # #
-- # 1. Redistributions of source code must retain the above copyright notice, this list of #
-- # conditions and the following disclaimer. #
-- # #
-- # 2. Redistributions in binary form must reproduce the above copyright notice, this list of #
-- # conditions and the following disclaimer in the documentation and/or other materials #
-- # provided with the distribution. #
-- # #
-- # 3. Neither the name of the copyright holder nor the names of its contributors may be used to #
-- # endorse or promote products derived from this software without specific prior written #
-- # permission. #
-- # #
-- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS #
-- # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF #
-- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE #
-- # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
-- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE #
-- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED #
-- # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
-- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED #
-- # OF THE POSSIBILITY OF SUCH DAMAGE. #
-- #################################################################################################
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
library neorv32;
use neorv32.neorv32_package.all;
-- SPI physical interface: shifts op_wdata_i out MSB-first on spi_dat_o while
-- shifting spi_dat_i in; the lowest 32 bits of the shift register are returned
-- as op_rdata_o.
entity neorv32_xip_phy is
port (
-- global control --
rstn_i : in std_ulogic; -- reset, async, low-active
clk_i : in std_ulogic; -- clock
spi_clk_en_i : in std_ulogic; -- pre-scaled SPI clock-enable
-- operation configuration --
cf_enable_i : in std_ulogic; -- module enable (reset if low)
cf_cpha_i : in std_ulogic; -- clock phase
cf_cpol_i : in std_ulogic; -- clock idle polarity
-- operation control --
op_start_i : in std_ulogic; -- trigger new transmission
op_final_i : in std_ulogic; -- end current transmission
op_csen_i : in std_ulogic; -- actually enabled device for transmission
op_busy_o : out std_ulogic; -- transmission in progress when set
op_nbytes_i : in std_ulogic_vector(03 downto 0); -- actual number of bytes to transmit (1..9)
op_wdata_i : in std_ulogic_vector(71 downto 0); -- write data
op_rdata_o : out std_ulogic_vector(31 downto 0); -- read data
-- SPI interface --
spi_csn_o : out std_ulogic; -- chip-select, low-active
spi_clk_o : out std_ulogic; -- serial clock
spi_dat_i : in std_ulogic; -- device data output
spi_dat_o : out std_ulogic -- controller data output
);
end neorv32_xip_phy;
architecture neorv32_xip_phy_rtl of neorv32_xip_phy is
-- serial engine --
-- S_IDLE: CS released, waiting for a start; S_START: load shift register;
-- S_SYNC: assert CS and align to the SPI clock-enable; S_RTX_A/S_RTX_B: the two
-- half-periods of one bit; S_DONE: transfer finished; S_WAIT: CS kept active,
-- waiting for either a resume (start) or a terminate (final) request.
type ctrl_state_t is (S_IDLE, S_WAIT, S_START, S_SYNC, S_RTX_A, S_RTX_B, S_DONE);
type ctrl_t is record
state : ctrl_state_t; -- FSM state
sreg : std_ulogic_vector(71 downto 0); -- only the lowest 32-bit are used as RX data
bitcnt : std_ulogic_vector(06 downto 0); -- remaining bits; loaded with op_nbytes_i*8, or 32 when resuming
di_sync : std_ulogic; -- spi_dat_i sampled in the first half of a bit, shifted in during the second half
csen : std_ulogic; -- op_csen_i captured while idle
end record;
signal ctrl : ctrl_t;
begin
-- Serial Interface Engine ----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
-- Clocked SPI shift engine. One bit takes two SPI clock-enable ticks (S_RTX_A samples
-- the input and drives the first clock edge, S_RTX_B shifts and drives the second one).
-- After a transfer the engine parks in S_WAIT with chip-select still active so that
-- a following 32-bit read can be appended without re-sending command and address.
serial_engine: process(rstn_i, clk_i)
begin
  if (rstn_i = '0') then
    ctrl.state   <= S_IDLE;
    ctrl.sreg    <= (others => '0');
    ctrl.bitcnt  <= (others => '0');
    ctrl.di_sync <= '0';
    ctrl.csen    <= '0';
    spi_csn_o    <= '1';
    spi_clk_o    <= '0';
  elsif rising_edge(clk_i) then
    if (cf_enable_i = '0') then
      -- sync reset: same values as the asynchronous reset --
      ctrl.state   <= S_IDLE;
      ctrl.sreg    <= (others => '0');
      ctrl.bitcnt  <= (others => '0');
      ctrl.di_sync <= '0';
      ctrl.csen    <= '0';
      spi_csn_o    <= '1';
      spi_clk_o    <= '0';
    else
      -- fsm (states listed in the order a transfer passes through them) --
      case ctrl.state is

        when S_IDLE => -- wait for new transmission trigger
        -- ------------------------------------------------------------
          ctrl.csen   <= op_csen_i;
          ctrl.bitcnt <= op_nbytes_i & "000"; -- number of bytes -> number of bits
          spi_clk_o   <= cf_cpol_i;
          spi_csn_o   <= '1'; -- flash disabled
          if (op_start_i = '1') then
            ctrl.state <= S_START;
          end if;

        when S_START => -- start of transmission (keep current spi_csn_o state!)
        -- ------------------------------------------------------------
          ctrl.sreg <= op_wdata_i;
          if (spi_clk_en_i = '1') then
            ctrl.state <= S_SYNC;
          end if;

        when S_SYNC => -- synchronize SPI clock
        -- ------------------------------------------------------------
          spi_csn_o <= not ctrl.csen; -- enable flash
          if (spi_clk_en_i = '1') then
            ctrl.state <= S_RTX_A;
            if (cf_cpha_i = '1') then -- clock phase shift
              spi_clk_o <= not cf_cpol_i;
            end if;
          end if;

        when S_RTX_A => -- first half of bit transmission
        -- ------------------------------------------------------------
          if (spi_clk_en_i = '1') then
            ctrl.state   <= S_RTX_B;
            ctrl.bitcnt  <= std_ulogic_vector(unsigned(ctrl.bitcnt) - 1);
            ctrl.di_sync <= spi_dat_i; -- sample device output
            spi_clk_o    <= not (cf_cpha_i xor cf_cpol_i);
          end if;

        when S_RTX_B => -- second half of bit transmission
        -- ------------------------------------------------------------
          if (spi_clk_en_i = '1') then
            ctrl.sreg <= ctrl.sreg(ctrl.sreg'left-1 downto 0) & ctrl.di_sync; -- shift left, sampled bit enters at the LSB
            if (or_reduce_f(ctrl.bitcnt) /= '0') then -- bits left?
              ctrl.state <= S_RTX_A; -- next bit
              spi_clk_o  <= cf_cpha_i xor cf_cpol_i;
            else -- all bits transferred
              ctrl.state <= S_DONE; -- transmission done
              spi_clk_o  <= cf_cpol_i;
            end if;
          end if;

        when S_DONE => -- transmission done
        -- ------------------------------------------------------------
          if (spi_clk_en_i = '1') then
            ctrl.state <= S_WAIT;
          end if;

        when S_WAIT => -- wait for resume transmission trigger
        -- ------------------------------------------------------------
          spi_csn_o   <= not ctrl.csen; -- keep CS active
          ctrl.bitcnt <= "0100000"; -- 4 bytes = 32-bit read data
          if (op_final_i /= '1') then
            if (op_start_i = '1') then -- resume flash access
              ctrl.state <= S_SYNC;
            end if;
          else -- terminate pending flash access (has priority over a resume request)
            ctrl.state <= S_IDLE;
          end if;

        when others => -- undefined
        -- ------------------------------------------------------------
          ctrl.state <= S_IDLE;

      end case;
    end if;
  end if;
end process serial_engine;
-- serial unit busy: the engine counts as idle only in S_IDLE and in S_WAIT (CS held, nothing shifting) --
op_busy_o <= '1' when (ctrl.state /= S_IDLE) and (ctrl.state /= S_WAIT) else '0';
-- serial data output: MSB of the shift register goes out first --
spi_dat_o <= ctrl.sreg(ctrl.sreg'high);
-- RX data: the 32 most recently received bits --
op_rdata_o <= ctrl.sreg(31 downto 0);
end neorv32_xip_phy_rtl;
-- ############################################################################################################################
-- ############################################################################################################################
-- #################################################################################################
-- # << NEORV32 - XIP Cache >> #
-- # ********************************************************************************************* #
-- # Simple direct-mapped read-only cache to accelerate XIP (SPI) flash accesses. #
-- # ********************************************************************************************* #
-- # BSD 3-Clause License #
-- # #
-- # The NEORV32 RISC-V Processor, https://github.com/stnolting/neorv32 #
-- # Copyright (c) 2024, Stephan Nolting. All rights reserved. #
-- # #
-- # Redistribution and use in source and binary forms, with or without modification, are #
-- # permitted provided that the following conditions are met: #
-- # #
-- # 1. Redistributions of source code must retain the above copyright notice, this list of #
-- # conditions and the following disclaimer. #
-- # #
-- # 2. Redistributions in binary form must reproduce the above copyright notice, this list of #
-- # conditions and the following disclaimer in the documentation and/or other materials #
-- # provided with the distribution. #
-- # #
-- # 3. Neither the name of the copyright holder nor the names of its contributors may be used to #
-- # endorse or promote products derived from this software without specific prior written #
-- # permission. #
-- # #
-- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS #
-- # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF #
-- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE #
-- # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
-- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE #
-- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED #
-- # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
-- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED #
-- # OF THE POSSIBILITY OF SUCH DAMAGE. #
-- #################################################################################################
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
library neorv32;
use neorv32.neorv32_package.all;
-- XIP cache entity: sits between a host request port (cpu_req_i/cpu_rsp_o) and a
-- downstream request port (bus_req_o/bus_rsp_i); clear_i invalidates the cache.
entity neorv32_xip_cache is
generic (
CACHE_NUM_BLOCKS : natural range 1 to 256; -- number of blocks (min 1), has to be a power of 2
-- NOTE(review): the range below admits 1..3 although the comment asks for min 4; not enforced in this entity
CACHE_BLOCK_SIZE : natural range 1 to 2**16 -- block size in bytes (min 4), has to be a power of 2
);
port (
clk_i : in std_ulogic; -- global clock, rising edge
rstn_i : in std_ulogic; -- global reset, low-active, async
clear_i : in std_ulogic; -- cache clear
cpu_req_i : in bus_req_t; -- request bus (host side)
cpu_rsp_o : out bus_rsp_t; -- response bus (host side)
bus_req_o : out bus_req_t; -- request bus (downstream side)
bus_rsp_i : in bus_rsp_t -- response bus (downstream side)
);
end neorv32_xip_cache;
architecture neorv32_xip_cache_rtl of neorv32_xip_cache is
-- auto configuration --
-- The generics are used as-is when they are a power of two; otherwise they are replaced
-- by 2**index_size_f(...). NOTE(review): the helpers come from neorv32_package, so
-- whether this rounds up or down is not visible here - confirm against the package.
constant block_num_c : natural := cond_sel_natural_f(is_power_of_two_f(CACHE_NUM_BLOCKS), CACHE_NUM_BLOCKS, 2**index_size_f(CACHE_NUM_BLOCKS));
constant block_size_c : natural := cond_sel_natural_f(is_power_of_two_f(CACHE_BLOCK_SIZE), CACHE_BLOCK_SIZE, 2**index_size_f(CACHE_BLOCK_SIZE));
constant offset_size_c : natural := index_size_f(block_size_c/4); -- offset addresses full 32-bit words
-- cache layout --
-- 32-bit address = tag & index & offset & 2-bit byte offset
constant index_size_c : natural := index_size_f(block_num_c);
constant tag_size_c : natural := 32 - (offset_size_c + index_size_c + 2); -- 2 additional bits for byte offset
constant entries_c : natural := block_num_c * (block_size_c/4); -- number of 32-bit entries (per set)
-- cache interface --
type cache_if_t is record
host_rdata : std_ulogic_vector(31 downto 0); -- cpu read data
host_rderr : std_ulogic; -- cpu read error
hit : std_ulogic; -- hit access
ctrl_en : std_ulogic; -- control access enable
ctrl_we : std_ulogic; -- control write enable
end record;
signal cache : cache_if_t;
-- control engine --
type ctrl_engine_state_t is (S_IDLE, S_CHECK, S_DOWNLOAD_REQ, S_DOWNLOAD_GET, S_RESYNC, S_ERROR);
signal state, state_nxt : ctrl_engine_state_t; -- FSM state
signal addr_reg, addr_reg_nxt : std_ulogic_vector(31 downto 0); -- address register for block download
-- cache memory --
type tag_mem_t is array (0 to block_num_c-1) of std_ulogic_vector(tag_size_c-1 downto 0); -- one tag per block
type data_mem_t is array (0 to entries_c-1) of std_ulogic_vector(31+1 downto 0); -- data word + ERR status
signal tag_mem : tag_mem_t;
signal data_mem : data_mem_t;
signal tag_rd : std_ulogic_vector(tag_size_c-1 downto 0); -- tag read data
signal data_rd : std_ulogic_vector(31+1 downto 0); -- data word + ERR status
signal valid_mem : std_ulogic_vector(block_num_c-1 downto 0); -- one valid flag per block
signal valid_rd : std_ulogic; -- valid flag read data
-- access address decomposition --
type acc_addr_t is record
tag : std_ulogic_vector(tag_size_c-1 downto 0);
index : std_ulogic_vector(index_size_c-1 downto 0);
offset : std_ulogic_vector(offset_size_c-1 downto 0);
end record;
signal host_acc, ctrl_acc : acc_addr_t; -- decomposed addresses; presumably host-side and control-engine-side accesses (drivers are outside this excerpt)
-- cache data memory access --
signal cache_index : std_ulogic_vector(index_size_c-1 downto 0);
signal cache_offset : std_ulogic_vector(offset_size_c-1 downto 0);
signal cache_addr : std_ulogic_vector((index_size_c+offset_size_c)-1 downto 0); -- index & offset
begin
-- Control Engine FSM Sync ----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
ctrl_engine_fsm_sync: process(rstn_i, clk_i)
begin
if (rstn_i = '0') then
state <= S_IDLE;
addr_reg <= (others => '0');
elsif rising_edge(clk_i) then
state <= state_nxt;
addr_reg <= addr_reg_nxt;
end if;
end process ctrl_engine_fsm_sync;
-- Control Engine FSM Comb ----------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
ctrl_engine_fsm_comb: process(state, addr_reg, cache, clear_i, cpu_req_i, bus_rsp_i)
begin
-- control defaults --
state_nxt <= state;
addr_reg_nxt <= addr_reg;
-- cache defaults --
cache.ctrl_en <= '0';
cache.ctrl_we <= '0';
-- host response defaults --
cpu_rsp_o.ack <= '0';
cpu_rsp_o.err <= '0';
cpu_rsp_o.data <= (others => '0');
-- bus interface defaults --
bus_req_o.data <= (others => '0');
bus_req_o.ben <= (others => '0');
bus_req_o.src <= cpu_req_i.src;
bus_req_o.priv <= cpu_req_i.priv;
bus_req_o.addr <= addr_reg;
bus_req_o.rw <= '0'; -- read-only
bus_req_o.stb <= '0';
bus_req_o.rvso <= cpu_req_i.rvso;
bus_req_o.fence <= cpu_req_i.fence;
-- fsm --
case state is
when S_IDLE => -- wait for host access request or cache control operation
-- ------------------------------------------------------------
if (cpu_req_i.stb = '1') then
if (cpu_req_i.rw = '1') or (clear_i = '1') then -- write access or cache being cleared
state_nxt <= S_ERROR;
else -- actual cache access
state_nxt <= S_CHECK;
end if;
end if;
when S_CHECK => -- finalize host access if cache hit
-- ------------------------------------------------------------
-- calculate block base address in case we need to download it --
addr_reg_nxt <= cpu_req_i.addr;
addr_reg_nxt((offset_size_c+2)-1 downto 0) <= (others => '0'); -- block-aligned
--
cpu_rsp_o.data <= cache.host_rdata; -- output read data in case we have a hit
if (cache.hit = '1') then -- cache HIT
cpu_rsp_o.err <= cache.host_rderr;
cpu_rsp_o.ack <= not cache.host_rderr;
state_nxt <= S_IDLE;
else -- cache MISS
state_nxt <= S_DOWNLOAD_REQ;
end if;
when S_DOWNLOAD_REQ => -- download new cache block: request new word
-- ------------------------------------------------------------
bus_req_o.stb <= '1'; -- request new read transfer
state_nxt <= S_DOWNLOAD_GET;
when S_DOWNLOAD_GET => -- download new cache block: wait for bus response
-- ------------------------------------------------------------
cache.ctrl_en <= '1'; -- cache update operation
if (bus_rsp_i.ack = '1') or (bus_rsp_i.err = '1') then -- ACK or ERROR = write to cache and get next word (store ERROR flag in cache)
cache.ctrl_we <= '1'; -- write to cache
if (and_reduce_f(addr_reg((offset_size_c+2)-1 downto 2)) = '1') then -- block complete?
state_nxt <= S_RESYNC;
else -- get next word
addr_reg_nxt <= std_ulogic_vector(unsigned(addr_reg) + 4);
state_nxt <= S_DOWNLOAD_REQ;
end if;
end if;
when S_RESYNC => -- re-sync host/cache access: cache read-latency dummy cycle
-- ------------------------------------------------------------
state_nxt <= S_CHECK;
when others => -- S_ERROR: error
-- ------------------------------------------------------------
cpu_rsp_o.err <= '1';
state_nxt <= S_IDLE;
end case;
end process ctrl_engine_fsm_comb;
-- Access Address Decomposition -----------------------------------------------------------
-- -------------------------------------------------------------------------------------------
host_acc.tag <= cpu_req_i.addr(31 downto 31-(tag_size_c-1));
host_acc.index <= cpu_req_i.addr(31-tag_size_c downto 2+offset_size_c);
host_acc.offset <= cpu_req_i.addr(2+(offset_size_c-1) downto 2); -- discard byte offset
ctrl_acc.tag <= addr_reg(31 downto 31-(tag_size_c-1));
ctrl_acc.index <= addr_reg(31-tag_size_c downto 2+offset_size_c);
ctrl_acc.offset <= addr_reg(2+(offset_size_c-1) downto 2); -- discard byte offset
-- Status Flag Memory ---------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
status_memory: process(rstn_i, clk_i) -- single-port RAM
begin
if (rstn_i = '0') then
valid_mem <= (others => '0');
valid_rd <= '0';
elsif rising_edge(clk_i) then
if (clear_i = '1') then -- invalidate cache
valid_mem <= (others => '0');
elsif (cache.ctrl_we = '1') then -- make current block valid
valid_mem(to_integer(unsigned(cache_index))) <= '1';
end if;
valid_rd <= valid_mem(to_integer(unsigned(cache_index)));
end if;
end process status_memory;
-- Cache Data Memory ----------------------------------------------------------------------
-- -------------------------------------------------------------------------------------------
cache_memory: process(clk_i) -- single-port RAM
begin
if rising_edge(clk_i) then -- no reset to allow mapping to blockRAM
if (cache.ctrl_we = '1') then -- update cache block
data_mem(to_integer(unsigned(cache_addr))) <= bus_rsp_i.err & bus_rsp_i.data;
tag_mem(to_integer(unsigned(cache_index))) <= ctrl_acc.tag;
end if;
data_rd <= data_mem(to_integer(unsigned(cache_addr)));
tag_rd <= tag_mem(to_integer(unsigned(cache_index)));
end if;
end process cache_memory;
-- cache access select --
cache_index <= host_acc.index when (cache.ctrl_en = '0') else ctrl_acc.index;
cache_offset <= host_acc.offset when (cache.ctrl_en = '0') else ctrl_acc.offset;
cache_addr <= cache_index & cache_offset; -- resulting ram access address
-- hit = tag match and valid entry --
cache.hit <= '1' when (host_acc.tag = tag_rd) and (valid_rd = '1') else '0';
-- data output --
cache.host_rdata <= data_rd(31 downto 0);
cache.host_rderr <= data_rd(32);
end neorv32_xip_cache_rtl;