|
------------------------------------------------------------------------------
|
|
---- ----
|
|
---- ZPU Medium ----
|
|
---- ----
|
|
---- http://www.opencores.org/ ----
|
|
---- ----
|
|
---- Description: ----
|
|
---- ZPU is a small 32-bit stack CPU. This is the medium-size version. ----
|
|
---- Supports external memories. ----
|
|
---- ----
|
|
---- To Do: ----
|
|
---- - ----
|
|
---- ----
|
|
---- Author: ----
|
|
---- - Øyvind Harboe, oyvind.harboe zylin.com ----
|
|
---- - Salvador E. Tropea, salvador inti.gob.ar ----
|
|
---- ----
|
|
------------------------------------------------------------------------------
|
|
---- ----
|
|
---- Copyright (c) 2008 Øyvind Harboe <oyvind.harboe zylin.com> ----
|
|
---- Copyright (c) 2008 Salvador E. Tropea <salvador inti.gob.ar> ----
|
|
---- Copyright (c) 2008 Instituto Nacional de Tecnología Industrial ----
|
|
---- ----
|
|
---- Distributed under the BSD license ----
|
|
---- ----
|
|
------------------------------------------------------------------------------
|
|
---- ----
|
|
---- Design unit: ZPUMediumCore(Behave) (Entity and architecture) ----
|
|
---- File name: zpu_medium.vhdl ----
|
|
---- Note: None ----
|
|
---- Limitations: None known ----
|
|
---- Errors: None known ----
|
|
---- Library: zpu ----
|
|
---- Dependencies: IEEE.std_logic_1164 ----
|
|
---- IEEE.numeric_std ----
|
|
---- zpu.zpupkg ----
|
|
---- Target FPGA: Spartan 3 (XC3S400-4-FT256) ----
|
|
---- Language: VHDL ----
|
|
---- Wishbone: No ----
|
|
---- Synthesis tools: Xilinx Release 9.2.03i - xst J.39 ----
|
|
---- Simulation tools: GHDL [Sokcho edition] (0.2x) ----
|
|
---- Text editor: SETEdit 0.5.x ----
|
|
---- ----
|
|
------------------------------------------------------------------------------
|
|
--
|
|
-- write_en_o - set to '1' for a single cycle to send off a write request.
|
|
-- data_o is valid only while write_en_o='1'.
|
|
-- read_en_o - set to '1' for a single cycle to send off a read request.
|
|
-- mem_busy_i - It is illegal to send off a read/write request when
|
|
-- mem_busy_i='1'.
|
|
-- Set to '0' when data_i is valid after a read request.
|
|
-- If it goes to '1'(busy), it is on the cycle after read/
|
|
-- write_en_o is '1'.
|
|
-- addr_o - address for read/write request
|
|
-- data_i - read data. Valid only on the cycle after mem_busy_i='0'
|
|
-- after read_en_o='1' for a single cycle.
|
|
-- data_o - data to write
|
|
-- break_o - set to '1' when CPU hits break instruction
|
|
|
|
library IEEE;
|
|
use IEEE.std_logic_1164.all;
|
|
use IEEE.numeric_std.all;
|
|
|
|
library work;
|
|
use work.zpupkg.all;
|
|
|
|
entity ZPUMediumCore is
   generic(
      -- Data word width in bits: 16 or 32 (2**wordPower)
      WORD_SIZE     : integer:=32;
      -- Width of the whole address space, I/O included
      ADDR_W        : integer:=16;
      -- Width of the memory region (program+data+stack)
      MEM_W         : integer:=15;
      -- Value driven on bits we don't care about (unused bits)
      D_CARE_VAL    : std_logic:='X';
      -- Use the pipelined multiplier
      MULT_PIPE     : boolean:=false;
      -- Extra pipeline stages for binary operations (-, =, < and <=)
      BINOP_PIPE    : integer range 0 to 2:=0;
      -- Implement eq, loadb, neqbranch and pushspadd
      ENA_LEVEL0    : boolean:=true;
      -- Implement lessthan, ulessthan, mult, storeb, callpcrel and sub
      ENA_LEVEL1    : boolean:=true;
      -- Implement lessthanorequal, ulessthanorequal, call and poppcrel
      ENA_LEVEL2    : boolean:=false;
      -- Implement lshiftright
      ENA_LSHR      : boolean:=true;
      -- Make the enable_i input effective
      ENA_IDLE      : boolean:=false;
      -- Fold the st_fetch work into the st_execute state
      FAST_FETCH    : boolean:=true);
   port(
      clk_i         : in  std_logic; -- CPU clock
      reset_i       : in  std_logic; -- Synchronous reset
      enable_i      : in  std_logic; -- Hold the CPU (after reset)
      break_o       : out std_logic; -- A break instruction was executed
      dbg_o         : out zpu_dbgo_t; -- Debug outputs (i.e. trace log)
      -- Memory interface
      mem_busy_i    : in  std_logic; -- Memory is busy
      data_i        : in  unsigned(WORD_SIZE-1 downto 0); -- Data from memory
      data_o        : out unsigned(WORD_SIZE-1 downto 0); -- Data to memory
      addr_o        : out unsigned(ADDR_W-1 downto 0); -- Memory address
      write_en_o    : out std_logic; -- Memory write enable (32-bit)
      read_en_o     : out std_logic; -- Memory read enable (32-bit)
      byte_read_o   : out std_logic; -- Byte read request (raised by loadb)
      byte_write_o  : out std_logic; -- Byte write request (raised by storeb)
      short_write_o : out std_logic); -- Never happens (only ever driven to '0')
end entity ZPUMediumCore;
|
|
|
|
architecture Behave of ZPUMediumCore is
|
|
-- Number of low address bits that select a byte inside a word
constant BYTE_BITS    : integer:=WORD_SIZE/16;
-- Number of opcodes that fit in one memory word
constant WORD_BYTES   : integer:=WORD_SIZE/OPCODE_W;
constant MAX_ADDR_BIT : integer:=ADDR_W-2;
-- Initial value of the stack pointer: BRAM size-8.
-- SP_START_1 is the full byte address, SP_START is its word-address part.
constant SP_START_1   : unsigned(ADDR_W-1 downto 0):=
                        to_unsigned((2**MEM_W)-8,ADDR_W);
constant SP_START     : unsigned(ADDR_W-1 downto BYTE_BITS):=
                        SP_START_1(ADDR_W-1 downto BYTE_BITS);
|
|
|
|
-- Write back [SP+1]: the value is cached in b_r, so drive a write request
-- that stores it at the SP+1 address.
--   we     - write enable to raise
--   addr   - word address to drive
--   inc_sp - SP+1, used as the target address
--   data   - write data to drive
--   b      - cached [SP+1] value
procedure FlushB(signal we     : out std_logic;
                 signal addr   : out unsigned(ADDR_W-1 downto BYTE_BITS);
                 signal inc_sp : in  unsigned(ADDR_W-1 downto BYTE_BITS);
                 signal data   : out unsigned(WORD_SIZE-1 downto 0);
                 signal b      : in  unsigned(WORD_SIZE-1 downto 0)) is
begin
   data <= b;
   addr <= inc_sp;
   we   <= '1';
end procedure FlushB;
|
|
|
|
-- Plain stack push. Only the internal cache registers are touched, the
-- real memory is not accessed here.
--   sp - stack pointer, decremented by one word
--   a  - current top of stack cache ([SP])
--   b  - next-on-stack cache ([SP+1]), receives the old top of stack
procedure Push(signal sp : inout unsigned(ADDR_W-1 downto BYTE_BITS);
               signal a  : in    unsigned(WORD_SIZE-1 downto 0);
               signal b  : out   unsigned(WORD_SIZE-1 downto 0)) is
begin
   sp <= sp-1;
   -- The old [SP] becomes the new [SP+1]
   b  <= a;
end procedure Push;
|
|
|
|
-- Plain stack pop. Only the internal cache registers are touched, the
-- real memory is not accessed here.
--   sp - stack pointer, incremented by one word
--   a  - top of stack cache ([SP]), receives the old [SP+1]
--   b  - current next-on-stack cache ([SP+1])
procedure Pop(signal sp : inout unsigned(ADDR_W-1 downto BYTE_BITS);
              signal a  : out   unsigned(WORD_SIZE-1 downto 0);
              signal b  : in    unsigned(WORD_SIZE-1 downto 0)) is
begin
   sp <= sp+1;
   -- The old [SP+1] becomes the new [SP]
   a  <= b;
end procedure Pop;
|
|
|
|
-- Zero-extend a PC value (ADDR_W bits) to a full word (WORD_SIZE bits).
function ExpandPC(v : unsigned(ADDR_W-1 downto 0)) return unsigned is
   -- Starts as all zeros on every call; the low bits are overwritten below
   variable wide : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
begin
   wide(v'range):=v;
   return wide;
end function ExpandPC;
|
|
|
|
-- Program counter (byte address)
signal pc_r          : unsigned(ADDR_W-1 downto 0):=(others => '0');
-- Stack pointer (word address)
signal sp_r          : unsigned(ADDR_W-1 downto BYTE_BITS):=SP_START;
-- Shortcuts for the frequently used SP+1 and SP+2
signal inc_sp        : unsigned(ADDR_W-1 downto BYTE_BITS);
signal inc_inc_sp    : unsigned(ADDR_W-1 downto BYTE_BITS);

-- a_r caches the top of the stack [SP].
-- Note: this is a stack CPU, so this is a very important register.
signal a_r           : unsigned(WORD_SIZE-1 downto 0);
-- b_r caches the next value in the stack [SP+1]
signal b_r           : unsigned(WORD_SIZE-1 downto 0);

-- Pipeline registers for the binary operations (see BINOP_PIPE)
signal bin_op_res1_r : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
signal bin_op_res2_r : unsigned(WORD_SIZE-1 downto 0):=(others => '0');

-- Multiplier operands and result pipeline
signal mult_res1_r   : unsigned(WORD_SIZE-1 downto 0);
signal mult_res2_r   : unsigned(WORD_SIZE-1 downto 0);
signal mult_res3_r   : unsigned(WORD_SIZE-1 downto 0);
signal mult_a_r      : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
signal mult_b_r      : unsigned(WORD_SIZE-1 downto 0):=(others => '0');

-- '1' while an IM sequence is in progress
signal idim_r        : std_logic;

-- Registered versions of the memory interface outputs
signal write_en_r    : std_logic;
signal byte_read_r   : std_logic;
signal byte_write_r  : std_logic;
signal short_write_r : std_logic;
signal read_en_r     : std_logic;
-- Word part and byte part of the memory address
signal addr_r        : unsigned(ADDR_W-1 downto BYTE_BITS):=(others => '0');
signal addrl_r       : unsigned(BYTE_BITS-1 downto 0):=(others => '0');

-- Last fetched instruction word, latched for the decoder
signal fetched_w_r   : unsigned(WORD_SIZE-1 downto 0);
|
|
|
|
-- States of the CPU control state machine
type state_t is(
   st_load2, st_popped, st_load_sp2, st_load_sp3, st_add_sp2,
   st_fetch, st_execute, st_decode, st_decode2, st_resync,
   st_store_sp2, st_resync2, st_resync3, st_loadb2,
   st_mult2, st_mult3, st_mult5, st_mult4, st_binary_op_res2,
   st_binary_op_res, st_idle);
-- Current state; it starts by refilling the stack cache (st_resync)
signal state : state_t:=st_resync;
|
|
|
|
-- Start fetching the instruction word addressed by the PC.
-- When FAST is false just jump to st_fetch, which does the work one clock
-- later. When FAST is true do the st_fetch work right here: if the memory
-- isn't busy issue the read and go to st_decode, otherwise change nothing.
--   FAST  - merge the fetch with the current state
--   state - state machine register to update
--   addr  - word address to drive
--   pc    - program counter
--   re    - read enable to raise
--   busy  - memory busy indication
procedure DoFetch(constant FAST  : boolean;
                  signal   state : out state_t;
                  signal   addr  : out unsigned(ADDR_W-1 downto BYTE_BITS);
                  signal   pc    : in  unsigned(ADDR_W-1 downto 0);
                  signal   re    : out std_logic;
                  signal   busy  : in  std_logic) is
begin
   if not FAST then
      state <= st_fetch;
   elsif busy='0' then
      -- Same work as the st_fetch state
      state <= st_decode;
      re    <= '1';
      addr  <= pc(ADDR_W-1 downto BYTE_BITS);
   end if;
end procedure DoFetch;
|
|
|
|
-- Finish a "binary operation" (2 operands) whose result was already
-- computed by the caller.
--   result - value to store
--   state  - state machine register to update
--   sp     - stack pointer (only modified when DEPTH is not 1 or 2)
--   addr   - word address to drive (only when DEPTH is not 1 or 2)
--   re     - read enable to raise (only when DEPTH is not 1 or 2)
--   dest   - final destination of the result (DEPTH not 1 or 2)
--   dest_p - pipeline register that holds the result (DEPTH 1 or 2)
--   DEPTH  - number of extra pipeline clocks
procedure DoBinOp(         result : in    unsigned(WORD_SIZE-1 downto 0);
                  signal   state  : out   state_t;
                  signal   sp     : inout unsigned(ADDR_W-1 downto BYTE_BITS);
                  signal   addr   : out   unsigned(ADDR_W-1 downto BYTE_BITS);
                  signal   re     : out   std_logic;
                  signal   dest   : out   unsigned(WORD_SIZE-1 downto 0);
                  signal   dest_p : out   unsigned(WORD_SIZE-1 downto 0);
                  constant DEPTH  : natural) is
begin
   case DEPTH is
        when 2 =>
             -- 2 clocks: st_binary_op_res+st_binary_op_res2
             dest_p <= result;
             state  <= st_binary_op_res;
        when 1 =>
             -- 1 clock: st_binary_op_res2
             dest_p <= result;
             state  <= st_binary_op_res2;
        when others =>
             -- 0 clocks: store the result, pop one element and ask
             -- for the word at SP+2 right now
             dest  <= result;
             sp    <= sp+1;
             addr  <= sp+2;
             re    <= '1';
             state <= st_popped;
   end case;
end procedure DoBinOp;
|
|
|
|
-- Finish a boolean "binary operation" (2 operands): the boolean result is
-- converted to a word (1 for true, 0 for false) and handed to DoBinOp.
-- The remaining parameters are passed unchanged to DoBinOp.
procedure DoBinOpBool(         result : in    boolean;
                      signal   state  : out   state_t;
                      signal   sp     : inout unsigned(ADDR_W-1 downto BYTE_BITS);
                      signal   addr   : out   unsigned(ADDR_W-1 downto BYTE_BITS);
                      signal   re     : out   std_logic;
                      signal   dest   : out   unsigned(WORD_SIZE-1 downto 0);
                      signal   dest_p : out   unsigned(WORD_SIZE-1 downto 0);
                      constant DEPTH  : natural) is
   variable word_res : unsigned(WORD_SIZE-1 downto 0);
begin
   word_res:=(others => '0');
   if result then
      word_res(0):='1';
   end if;
   DoBinOp(word_res,state,sp,addr,re,dest,dest_p,DEPTH);
end procedure DoBinOpBool;
|
|
|
|
-- Decoded instructions. dec_insn_fetch isn't a real instruction, it
-- requests a new instruction word.
type insn_t is (
   dec_add_top, dec_dup, dec_dup_stk_b, dec_pop, dec_add,
   dec_or, dec_and, dec_store, dec_add_sp, dec_shift, dec_nop,
   dec_im, dec_load_sp, dec_store_sp, dec_emulate, dec_load,
   dec_push_sp, dec_pop_pc, dec_pop_pc_rel, dec_not, dec_flip,
   dec_pop_sp, dec_neq_branch, dec_eq, dec_loadb, dec_mult,
   dec_less_than, dec_less_than_or_equal, dec_lshr,
   dec_u_less_than_or_equal, dec_u_less_than, dec_push_sp_add,
   dec_call, dec_call_pc_rel, dec_sub, dec_break, dec_storeb,
   dec_insn_fetch, dec_pop_down);
-- Instruction currently being executed
signal insn     : insn_t;
-- Decoded instructions of the last fetched word, one per opcode
type insn_array_t is array(0 to WORD_BYTES-1) of insn_t;
signal insns    : insn_array_t;
-- Raw opcodes of the last fetched word
type opcode_array_t is array(0 to WORD_BYTES-1) of unsigned(OPCODE_W-1 downto 0);
signal opcode_r : opcode_array_t;
|
|
begin
|
|
-- Memory interface outputs come straight from registers. The memory
-- subsystem will tell us one cycle later whether or not it is busy.
read_en_o     <= read_en_r;
write_en_o    <= write_en_r;
byte_read_o   <= byte_read_r;
byte_write_o  <= byte_write_r;
short_write_o <= short_write_r;
-- Full address: word part followed by the byte-inside-word part
addr_o        <= addr_r & addrl_r;

-- SP+1 and SP+2
inc_sp        <= sp_r+1;
inc_inc_sp    <= sp_r+2;
|
|
|
|
-- Main CPU process: fetches instruction words, decodes all the opcodes of
-- a word in parallel and executes them one at a time. Everything is
-- registered on the rising edge of clk_i, reset_i is synchronous.
opcode_control:
process (clk_i)
   variable topcode    : unsigned(OPCODE_W-1 downto 0);
   variable ex_opcode  : unsigned(OPCODE_W-1 downto 0);
   variable sp_offset  : unsigned(4 downto 0);
   variable tsp_offset : unsigned(4 downto 0);
   variable next_pc    : unsigned(ADDR_W-1 downto 0);
   variable tdecoded   : insn_t;
   variable tinsns     : insn_array_t;
   variable mult_res   : unsigned(WORD_SIZE*2-1 downto 0);
   variable ipc_low    : integer range 0 to 3; -- Address inside a word (pc_r)
   variable inpc_low   : integer range 0 to 3; -- Address inside a word (next_pc)
   -- '1' on the first cycle of an lshiftright, used for the trace log
   variable not_lshr   : std_logic:='1';
begin
   if rising_edge(clk_i) then
      break_o <= '0';
      if reset_i='1' then
         if ENA_IDLE then
            state <= st_idle;
         else
            state <= st_resync;
         end if;
         sp_r          <= SP_START;
         pc_r          <= (others => '0');
         idim_r        <= '0';
         write_en_r    <= '0';
         byte_read_r   <= '0';
         byte_write_r  <= '0';
         short_write_r <= '0';
         read_en_r     <= '0';
         mult_a_r      <= (others => '0');
         mult_b_r      <= (others => '0');
         dbg_o.b_inst  <= '0';
         -- Resetting addr_r here makes XST fail to use BRAMs ?!
      else -- reset_i='1'
         if MULT_PIPE then
            -- We must multiply unconditionally to get pipelined multiplication
            mult_res:=mult_a_r*mult_b_r;
            mult_res1_r <= mult_res(WORD_SIZE-1 downto 0);
            mult_res2_r <= mult_res1_r;
            mult_res3_r <= mult_res2_r;
            mult_a_r    <= (others => D_CARE_VAL);
            mult_b_r    <= (others => D_CARE_VAL);
         end if;

         if BINOP_PIPE=2 then
            bin_op_res2_r <= bin_op_res1_r; -- pipeline a bit.
         end if;

         -- Memory requests last one clock: they are cleared here and the
         -- states below raise them again when needed.
         read_en_r     <= '0';
         write_en_r    <= '0';
         byte_read_r   <= '0';
         byte_write_r  <= '0';
         short_write_r <= '0';
         -- Allow synthesis tools to load bogus values when we don't
         -- care about the address and output data.
         addr_r  <= (others => D_CARE_VAL);
         data_o  <= (others => D_CARE_VAL);
         -- Width independent, addrl_r is BYTE_BITS wide (1 bit for
         -- WORD_SIZE=16, 2 bits for WORD_SIZE=32)
         addrl_r <= (others => '0');

         if (write_en_r='1') and (read_en_r='1') then
            report "read/write collision" severity failure;
         end if;

         ipc_low:=to_integer(pc_r(BYTE_BITS-1 downto 0));
         -- Offset encoded in the current opcode, bit 4 is inverted
         sp_offset(4):=not opcode_r(ipc_low)(4);
         sp_offset(3 downto 0):=opcode_r(ipc_low)(3 downto 0);
         next_pc:=pc_r+1;

         -- Prepare trace snapshot
         dbg_o.opcode <= opcode_r(ipc_low);
         dbg_o.pc     <= resize(pc_r,32);
         dbg_o.stk_a  <= resize(a_r,32);
         dbg_o.stk_b  <= resize(b_r,32);
         dbg_o.b_inst <= '0';
         dbg_o.sp     <= (others => '0');
         dbg_o.sp(ADDR_W-1 downto BYTE_BITS) <= sp_r;

         case state is
              when st_idle =>
                   if enable_i='1' then
                      state <= st_resync;
                   end if;
              -- Initial state of ZPU, fetch top of stack (A/B) + first instruction
              when st_resync =>
                   if mem_busy_i='0' then
                      addr_r    <= sp_r;
                      read_en_r <= '1';
                      state     <= st_resync2;
                   end if;
              when st_resync2 =>
                   if mem_busy_i='0' then
                      a_r       <= data_i;
                      addr_r    <= inc_sp;
                      read_en_r <= '1';
                      state     <= st_resync3;
                   end if;
              when st_resync3 =>
                   if mem_busy_i='0' then
                      b_r       <= data_i;
                      addr_r    <= pc_r(ADDR_W-1 downto BYTE_BITS);
                      read_en_r <= '1';
                      state     <= st_decode;
                   end if;
              when st_decode =>
                   if mem_busy_i='0' then
                      -- Here we latch the fetched word to give one full clock
                      -- cycle to the instruction decoder. This could be removed
                      -- if using BRAMs and the decoder delay isn't important.
                      fetched_w_r <= data_i;
                      state       <= st_decode2;
                   end if;
              when st_decode2 =>
                   -- Decode all the instructions of the word in parallel.
                   -- Opcode 0 is in the most significant byte.
                   for i in 0 to WORD_BYTES-1 loop
                       topcode:=fetched_w_r((WORD_BYTES-1-i+1)*8-1 downto (WORD_BYTES-1-i)*8);

                       tsp_offset(4):=not topcode(4);
                       tsp_offset(3 downto 0):=topcode(3 downto 0);

                       opcode_r(i) <= topcode;
                       if topcode(7 downto 7)=OPCODE_IM then
                          tdecoded:=dec_im;
                       elsif topcode(7 downto 5)=OPCODE_STORESP then
                          if tsp_offset=0 then
                             -- Special case, we can avoid a write
                             tdecoded:=dec_pop;
                          elsif tsp_offset=1 then
                             -- Special case, collision
                             tdecoded:=dec_pop_down;
                          else
                             tdecoded:=dec_store_sp;
                          end if;
                       elsif topcode(7 downto 5)=OPCODE_LOADSP then
                          if tsp_offset=0 then
                             tdecoded:=dec_dup;
                          elsif tsp_offset=1 then
                             tdecoded:=dec_dup_stk_b;
                          else
                             tdecoded:=dec_load_sp;
                          end if;
                       elsif topcode(7 downto 5)=OPCODE_EMULATE then
                          -- Emulated unless the hardware version is enabled
                          tdecoded:=dec_emulate;
                          if ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_NEQBRANCH then
                             tdecoded:=dec_neq_branch;
                          elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_EQ then
                             tdecoded:=dec_eq;
                          elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_LOADB then
                             tdecoded:=dec_loadb;
                          elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_PUSHSPADD then
                             tdecoded:=dec_push_sp_add;
                          elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_LESSTHAN then
                             tdecoded:=dec_less_than;
                          elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_ULESSTHAN then
                             tdecoded:=dec_u_less_than;
                          elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_MULT then
                             tdecoded:=dec_mult;
                          elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_STOREB then
                             tdecoded:=dec_storeb;
                          elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_CALLPCREL then
                             tdecoded:=dec_call_pc_rel;
                          elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_SUB then
                             tdecoded:=dec_sub;
                          elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_LESSTHANOREQUAL then
                             tdecoded:=dec_less_than_or_equal;
                          elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_ULESSTHANOREQUAL then
                             tdecoded:=dec_u_less_than_or_equal;
                          elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_CALL then
                             tdecoded:=dec_call;
                          elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_POPPCREL then
                             tdecoded:=dec_pop_pc_rel;
                          elsif ENA_LSHR and topcode(5 downto 0)=OPCODE_LSHIFTRIGHT then
                             tdecoded:=dec_lshr;
                          end if;
                       elsif topcode(7 downto 4)=OPCODE_ADDSP then
                          if tsp_offset=0 then
                             tdecoded:=dec_shift;
                          elsif tsp_offset=1 then
                             tdecoded:=dec_add_top;
                          else
                             tdecoded:=dec_add_sp;
                          end if;
                       else -- OPCODE_SHORT
                          case topcode(3 downto 0) is
                               when OPCODE_BREAK =>
                                    tdecoded:=dec_break;
                               when OPCODE_PUSHSP =>
                                    tdecoded:=dec_push_sp;
                               when OPCODE_POPPC =>
                                    tdecoded:=dec_pop_pc;
                               when OPCODE_ADD =>
                                    tdecoded:=dec_add;
                               when OPCODE_OR =>
                                    tdecoded:=dec_or;
                               when OPCODE_AND =>
                                    tdecoded:=dec_and;
                               when OPCODE_LOAD =>
                                    tdecoded:=dec_load;
                               when OPCODE_NOT =>
                                    tdecoded:=dec_not;
                               when OPCODE_FLIP =>
                                    tdecoded:=dec_flip;
                               when OPCODE_STORE =>
                                    tdecoded:=dec_store;
                               when OPCODE_POPSP =>
                                    tdecoded:=dec_pop_sp;
                               when others => -- OPCODE_NOP and others
                                    tdecoded:=dec_nop;
                          end case;
                       end if;
                       tinsns(i):=tdecoded;
                   end loop;

                   insn <= tinsns(ipc_low);
                   -- once we wrap, we need to fetch
                   tinsns(0):=dec_insn_fetch;
                   insns <= tinsns;
                   state <= st_execute;

              -- Each instruction must:
              --
              -- 1. increase pc_r if applicable
              -- 2. set next state if applicable
              -- 3. do its operation
              when st_execute =>
                   -- Some shortcut to make the code readable:
                   inpc_low:=to_integer(next_pc(BYTE_BITS-1 downto 0));
                   ex_opcode:=opcode_r(ipc_low);

                   if (mem_busy_i = '0') then -- MWW, do not move on until mem is not busy!
                      -- By default the next instruction of the word follows
                      insn <= insns(inpc_low);

                      -- Defaults used by most instructions
                      if insn/=dec_insn_fetch and insn/=dec_im then
                         dbg_o.b_inst <= '1';
                         idim_r       <= '0';
                      end if;

                      case insn is
                           when dec_insn_fetch =>
                                -- Not a real instruction, fetch new instructions
                                DoFetch(FAST_FETCH,state,addr_r,pc_r,read_en_r,mem_busy_i);
                           when dec_im =>
                                -- Push(immediate value), IDIM=1
                                -- if IDIM=0 Push(signed(opcode & 0x7F)) else
                                -- Push((Pop()<<7)|(opcode&0x7F))
                                if mem_busy_i='0' then
                                   dbg_o.b_inst <= '1';
                                   idim_r       <= '1';
                                   pc_r         <= pc_r+1;
                                   if idim_r='1' then
                                      -- We already started an IM sequence
                                      -- Shift left 7 bits
                                      a_r(WORD_SIZE-1 downto 7) <= a_r(WORD_SIZE-8 downto 0);
                                      -- Put the new value
                                      a_r(6 downto 0) <= ex_opcode(6 downto 0);
                                   else
                                      -- First IM, push the value sign extended
                                      FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
                                      a_r <= unsigned(resize(signed(ex_opcode(6 downto 0)),WORD_SIZE));
                                      Push(sp_r,a_r,b_r);
                                   end if;
                                end if;
                           when dec_store_sp =>
                                -- [SP+Offset]=Pop()
                                if mem_busy_i='0' then
                                   write_en_r <= '1';
                                   addr_r     <= sp_r+sp_offset;
                                   data_o     <= a_r;
                                   Pop(sp_r,a_r,b_r);
                                   -- We need to fetch B
                                   state <= st_store_sp2;
                                end if;
                           when dec_load_sp =>
                                -- Push([SP+Offset])
                                if mem_busy_i='0' then
                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
                                   Push(sp_r,a_r,b_r);
                                   -- We are flushing B cache, so we need more time to
                                   -- read the value.
                                   state <= st_load_sp2;
                                end if;
                           when dec_emulate =>
                                -- Push(PC+1), PC=Opcode[4:0]*32
                                if mem_busy_i='0' then
                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
                                   state <= st_fetch;
                                   a_r   <= ExpandPC(pc_r+1);
                                   Push(sp_r,a_r,b_r);
                                   -- The emulate address is:
                                   --        98 7654 3210
                                   -- 0000 00aa aaa0 0000
                                   pc_r <= (others => '0');
                                   pc_r(9 downto 5) <= ex_opcode(4 downto 0);
                                end if;
                           when dec_call_pc_rel =>
                                -- t=Pop(), Push(PC+1), PC=PC+t
                                if mem_busy_i='0' and ENA_LEVEL1 then
                                   state <= st_fetch;
                                   a_r   <= ExpandPC(pc_r+1);
                                   pc_r  <= pc_r+a_r(ADDR_W-1 downto 0);
                                end if;
                           when dec_call =>
                                -- t=Pop(), Push(PC+1), PC=t
                                if mem_busy_i='0' and ENA_LEVEL2 then
                                   state <= st_fetch;
                                   a_r   <= ExpandPC(pc_r+1);
                                   pc_r  <= a_r(ADDR_W-1 downto 0);
                                end if;
                           when dec_add_sp =>
                                -- Push(Pop()+[SP+Offset])
                                if mem_busy_i='0' then
                                   -- Read SP+Offset
                                   state     <= st_add_sp2;
                                   read_en_r <= '1';
                                   addr_r    <= sp_r+sp_offset;
                                   pc_r      <= pc_r+1;
                                end if;
                           when dec_push_sp =>
                                -- Push(SP)
                                if mem_busy_i='0' then
                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
                                   pc_r <= pc_r+1;
                                   a_r  <= (others => '0');
                                   a_r(ADDR_W-1 downto BYTE_BITS) <= sp_r;
                                   Push(sp_r,a_r,b_r);
                                end if;
                           when dec_pop_pc =>
                                -- PC=Pop() (return)
                                if mem_busy_i='0' then
                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
                                   state <= st_resync;
                                   pc_r  <= a_r(ADDR_W-1 downto 0);
                                   sp_r  <= inc_sp;
                                end if;
                           when dec_pop_pc_rel =>
                                -- PC=PC+Pop()
                                if mem_busy_i='0' and ENA_LEVEL2 then
                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
                                   state <= st_resync;
                                   pc_r  <= a_r(ADDR_W-1 downto 0)+pc_r;
                                   sp_r  <= inc_sp;
                                end if;
                           when dec_add =>
                                -- Push(Pop()+Pop()) [A=A+B, SP++, update B]
                                if mem_busy_i='0' then
                                   state     <= st_popped;
                                   a_r       <= a_r+b_r;
                                   read_en_r <= '1';
                                   addr_r    <= inc_inc_sp;
                                   sp_r      <= inc_sp;
                                end if;
                           when dec_sub =>
                                -- a=Pop(), b=Pop(), Push(b-a)
                                if mem_busy_i='0' and ENA_LEVEL1 then
                                   DoBinOp(b_r-a_r,state,sp_r,addr_r,read_en_r,
                                           a_r,bin_op_res1_r,BINOP_PIPE);
                                end if;
                           when dec_pop =>
                                -- Pop()
                                if mem_busy_i='0' then
                                   state     <= st_popped;
                                   addr_r    <= inc_inc_sp;
                                   read_en_r <= '1';
                                   Pop(sp_r,a_r,b_r);
                                end if;
                           when dec_pop_down =>
                                -- t=Pop(), Pop(), Push(t)
                                if mem_busy_i='0' then
                                   -- PopDown leaves top of stack unchanged
                                   state     <= st_popped;
                                   addr_r    <= inc_inc_sp;
                                   read_en_r <= '1';
                                   sp_r      <= inc_sp;
                                end if;
                           when dec_or =>
                                -- Push(Pop() or Pop())
                                if mem_busy_i='0' then
                                   state     <= st_popped;
                                   a_r       <= a_r or b_r;
                                   read_en_r <= '1';
                                   addr_r    <= inc_inc_sp;
                                   sp_r      <= inc_sp;
                                end if;
                           when dec_and =>
                                -- Push(Pop() and Pop())
                                if mem_busy_i='0' then
                                   state     <= st_popped;
                                   a_r       <= a_r and b_r;
                                   read_en_r <= '1';
                                   addr_r    <= inc_inc_sp;
                                   sp_r      <= inc_sp;
                                end if;
                           when dec_eq =>
                                -- a=Pop(), b=Pop(), Push(a=b ? 1 : 0)
                                if mem_busy_i='0' and ENA_LEVEL0 then
                                   DoBinOpBool(a_r=b_r,state,sp_r,addr_r,read_en_r,
                                               a_r,bin_op_res1_r,BINOP_PIPE);
                                end if;
                           when dec_u_less_than =>
                                -- a=Pop(), b=Pop(), Push(a<b ? 1 : 0)
                                if mem_busy_i='0' and ENA_LEVEL1 then
                                   DoBinOpBool(a_r<b_r,state,sp_r,addr_r,read_en_r,
                                               a_r,bin_op_res1_r,BINOP_PIPE);
                                end if;
                           when dec_u_less_than_or_equal =>
                                -- a=Pop(), b=Pop(), Push(a<=b ? 1 : 0)
                                if mem_busy_i='0' and ENA_LEVEL2 then
                                   DoBinOpBool(a_r<=b_r,state,sp_r,addr_r,read_en_r,
                                               a_r,bin_op_res1_r,BINOP_PIPE);
                                end if;
                           when dec_less_than =>
                                -- a=signed(Pop()), b=signed(Pop()), Push(a<b ? 1 : 0)
                                if mem_busy_i='0' and ENA_LEVEL1 then
                                   DoBinOpBool(signed(a_r)<signed(b_r),state,sp_r,
                                               addr_r,read_en_r,a_r,bin_op_res1_r,
                                               BINOP_PIPE);
                                end if;
                           when dec_less_than_or_equal =>
                                -- a=signed(Pop()), b=signed(Pop()), Push(a<=b ? 1 : 0)
                                if mem_busy_i='0' and ENA_LEVEL2 then
                                   DoBinOpBool(signed(a_r)<=signed(b_r),state,sp_r,
                                               addr_r,read_en_r,a_r,bin_op_res1_r,
                                               BINOP_PIPE);
                                end if;
                           when dec_load =>
                                -- Push([Pop()])
                                if mem_busy_i='0' then
                                   state     <= st_load2;
                                   addr_r    <= a_r(ADDR_W-1 downto BYTE_BITS);
                                   read_en_r <= '1';
                                   pc_r      <= pc_r+1;
                                end if;
                           when dec_dup =>
                                -- t=Pop(), Push(t), Push(t)
                                if mem_busy_i='0' then
                                   pc_r <= pc_r+1;
                                   -- A is dupped, no change
                                   Push(sp_r,a_r,b_r);
                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
                                end if;
                           when dec_dup_stk_b =>
                                -- Pop(), t=Pop(), Push(t), Push(t), Push(t)
                                if mem_busy_i='0' then
                                   pc_r <= pc_r+1;
                                   a_r  <= b_r;
                                   -- B goes to A
                                   Push(sp_r,a_r,b_r);
                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
                                end if;
                           when dec_store =>
                                -- a=Pop(), b=Pop(), [a]=b
                                if mem_busy_i='0' then
                                   state      <= st_resync;
                                   pc_r       <= pc_r+1;
                                   addr_r     <= a_r(ADDR_W-1 downto BYTE_BITS);
                                   data_o     <= b_r;
                                   write_en_r <= '1';
                                   sp_r       <= inc_inc_sp;
                                end if;
                           when dec_pop_sp =>
                                -- SP=Pop()
                                if mem_busy_i='0' then
                                   FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
                                   state <= st_resync;
                                   pc_r  <= pc_r+1;
                                   sp_r  <= a_r(ADDR_W-1 downto BYTE_BITS);
                                end if;
                           when dec_nop =>
                                pc_r <= pc_r+1;
                           when dec_not =>
                                -- Push(not(Pop()))
                                pc_r <= pc_r+1;
                                a_r  <= not a_r;
                           when dec_flip =>
                                -- Push(flip(Pop()))
                                pc_r <= pc_r+1;
                                for i in 0 to WORD_SIZE-1 loop
                                    a_r(i) <= a_r(WORD_SIZE-1-i);
                                end loop;
                           when dec_add_top =>
                                -- a=Pop(), b=Pop(), Push(b), Push(a+b)
                                pc_r <= pc_r+1;
                                a_r  <= a_r+b_r;
                           when dec_shift =>
                                -- Push(Pop()<<1) [equivalent to a=Pop(), Push(a+a)]
                                pc_r <= pc_r+1;
                                a_r(WORD_SIZE-1 downto 1) <= a_r(WORD_SIZE-2 downto 0);
                                a_r(0) <= '0';
                           when dec_push_sp_add =>
                                -- Push(Pop()+SP)
                                if ENA_LEVEL0 then
                                   pc_r <= pc_r+1;
                                   a_r  <= (others => '0');
                                   a_r(ADDR_W-1 downto BYTE_BITS) <=
                                      a_r(ADDR_W-1-BYTE_BITS downto 0)+sp_r;
                                end if;
                           when dec_neq_branch =>
                                -- a=Pop(), b=Pop(), PC+=b==0 ? 1 : a
                                -- Branches are almost always taken as they form loops
                                if ENA_LEVEL0 then
                                   sp_r <= inc_inc_sp;
                                   -- Need to fetch stack again.
                                   state <= st_resync;
                                   if b_r/=0 then
                                      pc_r <= a_r(ADDR_W-1 downto 0)+pc_r;
                                   else
                                      pc_r <= pc_r+1;
                                   end if;
                                end if;
                           when dec_mult =>
                                -- Push(Pop()*Pop())
                                if ENA_LEVEL1 then
                                   if MULT_PIPE then
                                      mult_a_r <= a_r;
                                      mult_b_r <= b_r;
                                      state    <= st_mult2;
                                   else
                                      mult_res:=a_r*b_r;
                                      mult_res1_r <= mult_res(WORD_SIZE-1 downto 0);
                                      state       <= st_mult5;
                                   end if;
                                end if;
                           when dec_break =>
                                -- Assert the break_o signal
                                --report "Break instruction encountered" severity failure;
                                break_o <= '1';
                                pc_r    <= pc_r+1;
                           when dec_loadb =>
                                -- Push([Pop()] & 0xFF) (byte address)
                                if mem_busy_i='0' and ENA_LEVEL0 then
                                   state   <= st_loadb2;
                                   addr_r  <= a_r(ADDR_W-1 downto BYTE_BITS);
                                   addrl_r <= a_r(BYTE_BITS-1 downto 0);
                                   --read_en_r <= '1';
                                   byte_read_r <= '1';
                                   pc_r        <= pc_r+1;
                                end if;
                           when dec_storeb =>
                                -- [Pop()]=Pop() & 0xFF (byte address)
                                if mem_busy_i='0' and ENA_LEVEL1 then
                                   state   <= st_resync;
                                   sp_r    <= inc_inc_sp;
                                   addr_r  <= a_r(ADDR_W-1 downto BYTE_BITS);
                                   addrl_r <= a_r(BYTE_BITS-1 downto 0);
                                   --write_en_r <= '1';
                                   byte_write_r <= '1';
                                   pc_r         <= pc_r+1;
                                   -- The byte is replicated on every byte lane of
                                   -- the word. Done with a loop so it also fits
                                   -- WORD_SIZE=16.
                                   for i in 0 to WORD_BYTES-1 loop
                                       data_o(i*8+7 downto i*8) <= b_r(7 downto 0);
                                   end loop;
                                end if;
                           when dec_lshr =>
                                -- a=Pop(), b=Pop(), Push(b>>(a&0x3F))
                                if ENA_LSHR then
                                   -- This instruction takes more than one cycle.
                                   -- We must avoid duplications in the trace log.
                                   dbg_o.b_inst <= not_lshr;
                                   not_lshr:='0';
                                   if a_r(5 downto 0)=0 then -- Only 6 bits used
                                      -- No more shifts
                                      if mem_busy_i='0' then
                                         state     <= st_popped;
                                         a_r       <= b_r;
                                         read_en_r <= '1';
                                         addr_r    <= inc_inc_sp;
                                         sp_r      <= inc_sp;
                                         not_lshr:='1';
                                      end if;
                                   else -- More shifts needed
                                      b_r <= "0"&b_r(WORD_SIZE-1 downto 1);
                                      a_r(5 downto 0) <= a_r(5 downto 0)-1;
                                      -- Stay in this instruction, this overrides
                                      -- the default "next instruction" assignment
                                      insn <= insn;
                                   end if;
                                end if;
                           when others =>
                                -- Undefined behavior, we shouldn't get here.
                                -- It only helps synthesis tools.
                                sp_r <= (others => D_CARE_VAL);
                                report "Illegal decode instruction?!" severity failure;
                                --break_o <= '1';
                      end case;
                   end if;
              -- The followup of operations that takes more than one execution clock
              when st_store_sp2 =>
                   if mem_busy_i='0' then
                      addr_r    <= inc_sp;
                      read_en_r <= '1';
                      state     <= st_popped;
                   end if;
              when st_load_sp2 =>
                   if mem_busy_i='0' then
                      state <= st_load_sp3;
                      -- Now we can read SP+Offset (SP already decremented)
                      read_en_r <= '1';
                      addr_r    <= sp_r+sp_offset+1;
                   end if;
              when st_load_sp3 =>
                   if mem_busy_i='0' then
                      -- Note: We can't increment PC in the decode stage
                      -- because it will modify sp_offset.
                      pc_r <= pc_r+1;
                      -- Finally we have the result in A
                      state <= st_execute;
                      a_r   <= data_i;
                   end if;
              when st_add_sp2 =>
                   if mem_busy_i='0' then
                      state <= st_execute;
                      a_r   <= a_r+data_i;
                   end if;
              when st_load2 =>
                   if mem_busy_i='0' then
                      a_r   <= data_i;
                      state <= st_execute;
                   end if;
              when st_loadb2 =>
                   if mem_busy_i='0' then
                      a_r(WORD_SIZE-1 downto 8) <= (others => '0');
                      a_r(7 downto 0) <= data_i(7 downto 0);
                      state <= st_execute;
                   end if;
              when st_fetch =>
                   if mem_busy_i='0' then
                      addr_r    <= pc_r(ADDR_W-1 downto BYTE_BITS);
                      read_en_r <= '1';
                      state     <= st_decode;
                   end if;
              -- The following states can be used to leave cycles free for
              -- tools that can automagically decompose the multiplication
              -- in various stages. Xilinx tools can do it to increase the
              -- multipliers performance.
              when st_mult2 =>
                   state <= st_mult3;
              when st_mult3 =>
                   state <= st_mult4;
              when st_mult4 =>
                   state <= st_mult5;
              when st_mult5 =>
                   if mem_busy_i='0' then
                      if MULT_PIPE then
                         a_r <= mult_res3_r;
                      else
                         a_r <= mult_res1_r;
                      end if;
                      read_en_r <= '1';
                      addr_r    <= inc_inc_sp;
                      sp_r      <= inc_sp;
                      state     <= st_popped;
                   end if;
              when st_binary_op_res =>
                   -- BINOP_PIPE=2
                   state <= st_binary_op_res2;
              when st_binary_op_res2 =>
                   -- BINOP_PIPE>=1
                   read_en_r <= '1';
                   addr_r    <= inc_inc_sp;
                   sp_r      <= inc_sp;
                   state     <= st_popped;
                   if BINOP_PIPE=2 then
                      a_r <= bin_op_res2_r;
                   else -- 1
                      a_r <= bin_op_res1_r;
                   end if;
              when st_popped =>
                   if mem_busy_i='0' then
                      -- Note: Moving this PC++ to the decoder seems to
                      -- consume more LUTs.
                      pc_r  <= pc_r+1;
                      b_r   <= data_i;
                      state <= st_execute;
                   end if;
              when others =>
                   -- Undefined behavior, we shouldn't get here.
                   -- It only helps synthesis tools.
                   sp_r <= (others => D_CARE_VAL);
                   report "Illegal state?!" severity failure;
                   --break_o <= '1';
         end case; -- state
      end if; -- else reset_i='1'
   end if; -- rising_edge(clk_i)
end process opcode_control;
|
|
end architecture Behave; -- Entity: ZPUMediumCore
|
|
|