Project

General

Profile

1 markw
------------------------------------------------------------------------------
---- ----
---- ZPU Medium ----
---- ----
---- http://www.opencores.org/ ----
---- ----
---- Description: ----
---- ZPU is a 32 bits small stack cpu. This is the medium size version. ----
---- Supports external memories. ----
---- ----
---- To Do: ----
---- - ----
---- ----
---- Author: ----
---- - Øyvind Harboe, oyvind.harboe zylin.com ----
---- - Salvador E. Tropea, salvador inti.gob.ar ----
---- ----
------------------------------------------------------------------------------
---- ----
---- Copyright (c) 2008 Øyvind Harboe <oyvind.harboe zylin.com> ----
---- Copyright (c) 2008 Salvador E. Tropea <salvador inti.gob.ar> ----
---- Copyright (c) 2008 Instituto Nacional de Tecnología Industrial ----
---- ----
---- Distributed under the BSD license ----
---- ----
------------------------------------------------------------------------------
---- ----
---- Design unit: ZPUMediumCore(Behave) (Entity and architecture) ----
---- File name: zpu_medium.vhdl ----
---- Note: None ----
---- Limitations: None known ----
---- Errors: None known ----
---- Library: zpu ----
---- Dependencies: IEEE.std_logic_1164 ----
---- IEEE.numeric_std ----
---- zpu.zpupkg ----
---- Target FPGA: Spartan 3 (XC3S400-4-FT256) ----
---- Language: VHDL ----
---- Wishbone: No ----
---- Synthesis tools: Xilinx Release 9.2.03i - xst J.39 ----
---- Simulation tools: GHDL [Sokcho edition] (0.2x) ----
---- Text editor: SETEdit 0.5.x ----
---- ----
------------------------------------------------------------------------------
--
-- write_en_o - set to '1' for a single cycle to send off a write request.
-- data_o is valid only while write_en_o='1'.
-- read_en_o - set to '1' for a single cycle to send off a read request.
-- mem_busy_i - It is illegal to send off a read/write request when
-- mem_busy_i='1'.
-- Set to '0' when data_i is valid after a read request.
-- If it goes to '1'(busy), it is on the cycle after read/
-- write_en_o is '1'.
-- addr_o - address for read/write request
-- data_i - read data. Valid only on the cycle after mem_busy_i='0'
-- after read_en_o='1' for a single cycle.
-- data_o - data to write
-- break_o - set to '1' when CPU hits break instruction

library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.numeric_std.all;

library work;
use work.zpupkg.all;

entity ZPUMediumCore is
generic(
WORD_SIZE : integer:=32; -- 16/32 (2**wordPower)
ADDR_W : integer:=16; -- Total address space width (incl. I/O)
MEM_W : integer:=15; -- Memory (prog+data+stack) width
D_CARE_VAL : std_logic:='X'; -- Value used to fill the unsused bits
MULT_PIPE : boolean:=false; -- Pipeline multiplication
BINOP_PIPE : integer range 0 to 2:=0; -- Pipeline binary operations (-, =, < and <=)
ENA_LEVEL0 : boolean:=true; -- eq, loadb, neqbranch and pushspadd
ENA_LEVEL1 : boolean:=true; -- lessthan, ulessthan, mult, storeb, callpcrel and sub
ENA_LEVEL2 : boolean:=false; -- lessthanorequal, ulessthanorequal, call and poppcrel
ENA_LSHR : boolean:=true; -- lshiftright
ENA_IDLE : boolean:=false; -- Enable the enable_i input
FAST_FETCH : boolean:=true); -- Merge the st_fetch with the st_execute states
port(
clk_i : in std_logic; -- CPU Clock
reset_i : in std_logic; -- Sync Reset
enable_i : in std_logic; -- Hold the CPU (after reset)
break_o : out std_logic; -- Break instruction executed
dbg_o : out zpu_dbgo_t; -- Debug outputs (i.e. trace log)
-- Memory interface
mem_busy_i : in std_logic; -- Memory is busy
data_i : in unsigned(WORD_SIZE-1 downto 0); -- Data from mem
data_o : out unsigned(WORD_SIZE-1 downto 0); -- Data to mem
addr_o : out unsigned(ADDR_W-1 downto 0); -- Memory address
write_en_o : out std_logic; -- Memory write enable (32-bit)
read_en_o : out std_logic; -- Memory read enable (32-bit)
byte_read_o : out std_logic;
byte_write_o : out std_logic;
short_write_o: out std_logic); -- never happens
end entity ZPUMediumCore;

architecture Behave of ZPUMediumCore is
constant BYTE_BITS : integer:=WORD_SIZE/16; -- # of bits in a word that addresses bytes
constant WORD_BYTES : integer:=WORD_SIZE/OPCODE_W;
constant MAX_ADDR_BIT : integer:=ADDR_W-2;
-- Stack Pointer initial value: BRAM size-8
constant SP_START_1 : unsigned(ADDR_W-1 downto 0):=to_unsigned((2**MEM_W)-8,ADDR_W);
constant SP_START : unsigned(ADDR_W-1 downto BYTE_BITS):=
SP_START_1(ADDR_W-1 downto BYTE_BITS);

-- Update [SP+1]. We hold it in b_r, this writes the value to memory.
procedure FlushB(signal we : out std_logic;
signal addr : out unsigned(ADDR_W-1 downto BYTE_BITS);
signal inc_sp : in unsigned(ADDR_W-1 downto BYTE_BITS);
signal data : out unsigned(WORD_SIZE-1 downto 0);
signal b : in unsigned(WORD_SIZE-1 downto 0)) is
begin
we <= '1';
addr <= inc_sp;
data <= b;
end procedure FlushB;

-- Do a simple stack push, it is performed in the internal cache registers,
-- not in the real memory.
procedure Push(signal sp : inout unsigned(ADDR_W-1 downto BYTE_BITS);
signal a : in unsigned(WORD_SIZE-1 downto 0);
signal b : out unsigned(WORD_SIZE-1 downto 0)) is
begin
b <= a; -- Update cache [SP+1]=[SP]
sp <= sp-1;
end procedure Push;

-- Do a simple stack pop, it is performed in the internal cache registers,
-- not in the real memory.
procedure Pop(signal sp : inout unsigned(ADDR_W-1 downto BYTE_BITS);
signal a : out unsigned(WORD_SIZE-1 downto 0);
signal b : in unsigned(WORD_SIZE-1 downto 0)) is
begin
a <= b; -- Update cache [SP]=[SP+1]
sp <= sp+1;
end procedure Pop;

-- Expand a PC value to WORD_SIZE
function ExpandPC(v : unsigned(ADDR_W-1 downto 0)) return unsigned is
variable nv : unsigned(WORD_SIZE-1 downto 0);
begin
nv:=(others => '0');
nv(ADDR_W-1 downto 0):=v;
return nv;
end function ExpandPC;

-- Program counter
signal pc_r : unsigned(ADDR_W-1 downto 0):=(others => '0');
-- Stack pointer
signal sp_r : unsigned(ADDR_W-1 downto BYTE_BITS):=SP_START;
-- SP+1, SP+2 and SP-1 are very used, these are shortcuts
signal inc_sp : unsigned(ADDR_W-1 downto BYTE_BITS);
signal inc_inc_sp : unsigned(ADDR_W-1 downto BYTE_BITS);
-- a_r is a cache for the top of the stack [SP]
-- Note: as this is a stack CPU this is a very important register.
signal a_r : unsigned(WORD_SIZE-1 downto 0);
-- b_r is a cache for the next value in the stack [SP+1]
signal b_r : unsigned(WORD_SIZE-1 downto 0);
signal bin_op_res1_r : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
signal bin_op_res2_r : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
signal mult_res1_r : unsigned(WORD_SIZE-1 downto 0);
signal mult_res2_r : unsigned(WORD_SIZE-1 downto 0);
signal mult_res3_r : unsigned(WORD_SIZE-1 downto 0);
signal mult_a_r : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
signal mult_b_r : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
signal idim_r : std_logic;
signal write_en_r : std_logic;
signal byte_read_r : std_logic;
signal byte_write_r : std_logic;
signal short_write_r : std_logic;
signal read_en_r : std_logic;
signal addr_r : unsigned(ADDR_W-1 downto BYTE_BITS):=(others => '0');
signal addrl_r : unsigned(BYTE_BITS-1 downto 0):=(others => '0');
signal fetched_w_r : unsigned(WORD_SIZE-1 downto 0);

type state_t is(st_load2, st_popped, st_load_sp2, st_load_sp3, st_add_sp2,
st_fetch, st_execute, st_decode, st_decode2, st_resync,
st_store_sp2, st_resync2, st_resync3, st_loadb2,
st_mult2, st_mult3, st_mult5, st_mult4, st_binary_op_res2,
st_binary_op_res, st_idle);
signal state : state_t:=st_resync;

-- Go to st_fetch state or just do its work
procedure DoFetch(constant FAST : boolean;
signal state : out state_t;
signal addr : out unsigned(ADDR_W-1 downto BYTE_BITS);
signal pc : in unsigned(ADDR_W-1 downto 0);
signal re : out std_logic;
signal busy : in std_logic) is
begin
if FAST then
-- Equivalent to st_fetch
if busy='0' then
addr <= pc(ADDR_W-1 downto BYTE_BITS);
re <= '1';
state <= st_decode;
end if;
else
state <= st_fetch;
end if;
end procedure DoFetch;

-- Perform a "binary operation" (2 operands)
procedure DoBinOp(result : in unsigned(WORD_SIZE-1 downto 0);
signal state : out state_t;
signal sp : inout unsigned(ADDR_W-1 downto BYTE_BITS);
signal addr : out unsigned(ADDR_W-1 downto BYTE_BITS);
signal re : out std_logic;
signal dest : out unsigned(WORD_SIZE-1 downto 0);
signal dest_p : out unsigned(WORD_SIZE-1 downto 0);
constant DEPTH : natural) is
begin
if DEPTH=2 then
-- 2 clocks: st_binary_op_res+st_binary_op_res2
state <= st_binary_op_res;
dest_p <= result;
elsif DEPTH=1 then
-- 1 clock: st_binary_op_res2
state <= st_binary_op_res2;
dest_p <= result;
else -- 0 clocks
re <= '1';
addr <= sp+2;
sp <= sp+1;
dest <= result;
state <= st_popped;
end if;
end procedure DoBinOp;

-- Perform a boolean "binary operation" (2 operands)
procedure DoBinOpBool(result : in boolean;
signal state : out state_t;
signal sp : inout unsigned(ADDR_W-1 downto BYTE_BITS);
signal addr : out unsigned(ADDR_W-1 downto BYTE_BITS);
signal re : out std_logic;
signal dest : out unsigned(WORD_SIZE-1 downto 0);
signal dest_p : out unsigned(WORD_SIZE-1 downto 0);
constant DEPTH : natural) is
variable res : unsigned(WORD_SIZE-1 downto 0):=(others => '0');
begin
if result then
res(0):='1';
end if;
DoBinOp(res,state,sp,addr,re,dest,dest_p,DEPTH);
end procedure DoBinOpBool;

type insn_t is (dec_add_top, dec_dup, dec_dup_stk_b, dec_pop, dec_add,
dec_or, dec_and, dec_store, dec_add_sp, dec_shift, dec_nop,
dec_im, dec_load_sp, dec_store_sp, dec_emulate, dec_load,
dec_push_sp, dec_pop_pc, dec_pop_pc_rel, dec_not, dec_flip,
dec_pop_sp, dec_neq_branch, dec_eq, dec_loadb, dec_mult,
dec_less_than, dec_less_than_or_equal, dec_lshr,
dec_u_less_than_or_equal, dec_u_less_than, dec_push_sp_add,
dec_call, dec_call_pc_rel, dec_sub, dec_break, dec_storeb,
dec_insn_fetch, dec_pop_down);
signal insn : insn_t;
type insn_array_t is array(0 to WORD_BYTES-1) of insn_t;
signal insns : insn_array_t;
type opcode_array_t is array(0 to WORD_BYTES-1) of unsigned(OPCODE_W-1 downto 0);
signal opcode_r : opcode_array_t;
begin
-- the memory subsystem will tell us one cycle later whether or
-- not it is busy
write_en_o <= write_en_r;
read_en_o <= read_en_r;
byte_read_o <= byte_read_r;
byte_write_o <= byte_write_r;
short_write_o <= short_write_r;
addr_o(ADDR_W-1 downto BYTE_BITS) <= addr_r;
addr_o(BYTE_BITS-1 downto 0) <= addrl_r;

-- SP+1 and +2
inc_sp <= sp_r+1;
inc_inc_sp <= sp_r+2;

opcode_control:
process (clk_i)
variable topcode : unsigned(OPCODE_W-1 downto 0);
variable ex_opcode : unsigned(OPCODE_W-1 downto 0);
variable sp_offset : unsigned(4 downto 0);
variable tsp_offset : unsigned(4 downto 0);
variable next_pc : unsigned(ADDR_W-1 downto 0);
variable tdecoded : insn_t;
variable tinsns : insn_array_t;
variable mult_res : unsigned(WORD_SIZE*2-1 downto 0);
variable ipc_low : integer range 0 to 3; -- Address inside a word (pc_r)
variable inpc_low : integer range 0 to 3; -- Address inside a word (next_pc)
variable not_lshr : std_logic:='1';
begin
if rising_edge(clk_i) then
break_o <= '0';
if reset_i='1' then
if ENA_IDLE then
state <= st_idle;
else
state <= st_resync;
end if;
sp_r <= SP_START;
pc_r <= (others => '0');
idim_r <= '0';
write_en_r <= '0';
byte_read_r <= '0';
byte_write_r <= '0';
short_write_r <= '0';
read_en_r <= '0';
mult_a_r <= (others => '0');
mult_b_r <= (others => '0');
dbg_o.b_inst <= '0';
-- Reseting add_r here makes XST fail to use BRAMs ?!
else -- reset_i='1'
if MULT_PIPE then
-- We must multiply unconditionally to get pipelined multiplication
mult_res:=mult_a_r*mult_b_r;
mult_res1_r <= mult_res(WORD_SIZE-1 downto 0);
mult_res2_r <= mult_res1_r;
mult_res3_r <= mult_res2_r;
mult_a_r <= (others => D_CARE_VAL);
mult_b_r <= (others => D_CARE_VAL);
end if;

if BINOP_PIPE=2 then
bin_op_res2_r <= bin_op_res1_r; -- pipeline a bit.
end if;

read_en_r <='0';
write_en_r <='0';
byte_read_r <= '0';
byte_write_r <= '0';
short_write_r <= '0';
-- Allow synthesis tools to load bogus values when we don't
-- care about the address and output data.
addr_r <= (others => D_CARE_VAL);
data_o <= (others => D_CARE_VAL);
addrl_r <= "00";

if (write_en_r='1') and (read_en_r='1') then
report "read/write collision" severity failure;
end if;

ipc_low:=to_integer(pc_r(BYTE_BITS-1 downto 0));
sp_offset(4):=not opcode_r(ipc_low)(4);
sp_offset(3 downto 0):=opcode_r(ipc_low)(3 downto 0);
next_pc:=pc_r+1;

-- Prepare trace snapshot
dbg_o.opcode <= opcode_r(ipc_low);
dbg_o.pc <= resize(pc_r,32);
dbg_o.stk_a <= resize(a_r,32);
dbg_o.stk_b <= resize(b_r,32);
dbg_o.b_inst <= '0';
dbg_o.sp <= (others => '0');
dbg_o.sp(ADDR_W-1 downto BYTE_BITS) <= sp_r;

case state is
when st_idle =>
if enable_i='1' then
state <= st_resync;
end if;
-- Initial state of ZPU, fetch top of stack (A/B) + first instruction
when st_resync =>
if mem_busy_i='0' then
addr_r <= sp_r;
read_en_r <= '1';
state <= st_resync2;
end if;
when st_resync2 =>
if mem_busy_i='0' then
a_r <= data_i;
addr_r <= inc_sp;
read_en_r <= '1';
state <= st_resync3;
end if;
when st_resync3 =>
if mem_busy_i='0' then
b_r <= data_i;
addr_r <= pc_r(ADDR_W-1 downto BYTE_BITS);
read_en_r <= '1';
state <= st_decode;
end if;
when st_decode =>
if mem_busy_i='0' then
-- Here we latch the fetched word to give one full clock
-- cycle to the instruction decoder. This could be removed
-- if using BRAMs and the decoder delay isn't important.
fetched_w_r <= data_i;
state <= st_decode2;
end if;
when st_decode2 =>
-- decode 4 instructions in parallel
for i in 0 to WORD_BYTES-1 loop
topcode:=fetched_w_r((WORD_BYTES-1-i+1)*8-1 downto (WORD_BYTES-1-i)*8);

tsp_offset(4):=not topcode(4);
tsp_offset(3 downto 0):=topcode(3 downto 0);

opcode_r(i) <= topcode;
if topcode(7 downto 7)=OPCODE_IM then
tdecoded:=dec_im;
elsif topcode(7 downto 5)=OPCODE_STORESP then
if tsp_offset=0 then
-- Special case, we can avoid a write
tdecoded:=dec_pop;
elsif tsp_offset=1 then
-- Special case, collision
tdecoded:=dec_pop_down;
else
tdecoded:=dec_store_sp;
end if;
elsif topcode(7 downto 5)=OPCODE_LOADSP then
if tsp_offset=0 then
tdecoded:=dec_dup;
elsif tsp_offset=1 then
tdecoded:=dec_dup_stk_b;
else
tdecoded:=dec_load_sp;
end if;
elsif topcode(7 downto 5)=OPCODE_EMULATE then
tdecoded:=dec_emulate;
if ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_NEQBRANCH then
tdecoded:=dec_neq_branch;
elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_EQ then
tdecoded:=dec_eq;
elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_LOADB then
tdecoded:=dec_loadb;
elsif ENA_LEVEL0 and topcode(5 downto 0)=OPCODE_PUSHSPADD then
tdecoded:=dec_push_sp_add;
elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_LESSTHAN then
tdecoded:=dec_less_than;
elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_ULESSTHAN then
tdecoded:=dec_u_less_than;
elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_MULT then
tdecoded:=dec_mult;
elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_STOREB then
tdecoded:=dec_storeb;
elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_CALLPCREL then
tdecoded:=dec_call_pc_rel;
elsif ENA_LEVEL1 and topcode(5 downto 0)=OPCODE_SUB then
tdecoded:=dec_sub;
elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_LESSTHANOREQUAL then
tdecoded:=dec_less_than_or_equal;
elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_ULESSTHANOREQUAL then
tdecoded:=dec_u_less_than_or_equal;
elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_CALL then
tdecoded:=dec_call;
elsif ENA_LEVEL2 and topcode(5 downto 0)=OPCODE_POPPCREL then
tdecoded:=dec_pop_pc_rel;
elsif ENA_LSHR and topcode(5 downto 0)=OPCODE_LSHIFTRIGHT then
tdecoded:=dec_lshr;
end if;
elsif topcode(7 downto 4)=OPCODE_ADDSP then
if tsp_offset=0 then
tdecoded:=dec_shift;
elsif tsp_offset=1 then
tdecoded:=dec_add_top;
else
tdecoded:=dec_add_sp;
end if;
else -- OPCODE_SHORT
case topcode(3 downto 0) is
when OPCODE_BREAK =>
tdecoded:=dec_break;
when OPCODE_PUSHSP =>
tdecoded:=dec_push_sp;
when OPCODE_POPPC =>
tdecoded:=dec_pop_pc;
when OPCODE_ADD =>
tdecoded:=dec_add;
when OPCODE_OR =>
tdecoded:=dec_or;
when OPCODE_AND =>
tdecoded:=dec_and;
when OPCODE_LOAD =>
tdecoded:=dec_load;
when OPCODE_NOT =>
tdecoded:=dec_not;
when OPCODE_FLIP =>
tdecoded:=dec_flip;
when OPCODE_STORE =>
tdecoded:=dec_store;
when OPCODE_POPSP =>
tdecoded:=dec_pop_sp;
when others => -- OPCODE_NOP and others
tdecoded:=dec_nop;
end case;
end if;
tinsns(i):=tdecoded;
end loop;

insn <= tinsns(ipc_low);
-- once we wrap, we need to fetch
tinsns(0):=dec_insn_fetch;
insns <= tinsns;
state <= st_execute;

-- Each instruction must:
--
-- 1. increase pc_r if applicable
-- 2. set next state if applicable
-- 3. do it's operation
when st_execute =>
-- Some shortcut to make the code readable:
inpc_low:=to_integer(next_pc(BYTE_BITS-1 downto 0));
ex_opcode:=opcode_r(ipc_low);

if (mem_busy_i = '0') then -- MWW, do not move on until mem is not busy!
insn <= insns(inpc_low);

-- Defaults used by most instructions
if insn/=dec_insn_fetch and insn/=dec_im then
dbg_o.b_inst <= '1';
idim_r <= '0';
end if;

case insn is
when dec_insn_fetch =>
-- Not a real instruction, fetch new instructions
DoFetch(FAST_FETCH,state,addr_r,pc_r,read_en_r,mem_busy_i);
when dec_im =>
-- Push(immediate value), IDIM=1
-- if IDIM=0 Push(signed(opcode & 0x7F)) else
-- Push((Pop()<<7)|(opcode&0x7F))
if mem_busy_i='0' then
dbg_o.b_inst <= '1';
idim_r <= '1';
pc_r <= pc_r+1;
if idim_r='1' then
-- We already started an IM sequence
-- Shift left 7 bits
a_r(WORD_SIZE-1 downto 7) <= a_r(WORD_SIZE-8 downto 0);
-- Put the new value
a_r(6 downto 0) <= ex_opcode(6 downto 0);
else
-- First IM, push the value sign extended
FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
a_r <= unsigned(resize(signed(ex_opcode(6 downto 0)),WORD_SIZE));
Push(sp_r,a_r,b_r);
end if;
end if;
when dec_store_sp =>
-- [SP+Offset]=Pop()
if mem_busy_i='0' then
write_en_r <= '1';
addr_r <= sp_r+sp_offset;
data_o <= a_r;
Pop(sp_r,a_r,b_r);
-- We need to fetch B
state <= st_store_sp2;
end if;
when dec_load_sp =>
-- Push([SP+Offset])
if mem_busy_i='0' then
FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
Push(sp_r,a_r,b_r);
-- We are flushing B cache, so we need more time to
-- read the value.
state <= st_load_sp2;
end if;
when dec_emulate =>
-- Push(PC+1), PC=Opcode[4:0]*32
if mem_busy_i='0' then
FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
state <= st_fetch;
a_r <= ExpandPC(pc_r+1);
Push(sp_r,a_r,b_r);
-- The emulate address is:
-- 98 7654 3210
-- 0000 00aa aaa0 0000
pc_r <= (others => '0');
pc_r(9 downto 5) <= ex_opcode(4 downto 0);
end if;
when dec_call_pc_rel =>
-- t=Pop(), Push(PC+1), PC=PC+t
if mem_busy_i='0' and ENA_LEVEL1 then
state <= st_fetch;
a_r <= ExpandPC(pc_r+1);
pc_r <= pc_r+a_r(ADDR_W-1 downto 0);
end if;
when dec_call =>
-- t=Pop(), Push(PC+1), PC=t
if mem_busy_i='0' and ENA_LEVEL2 then
state <= st_fetch;
a_r <= ExpandPC(pc_r+1);
pc_r <= a_r(ADDR_W-1 downto 0);
end if;
when dec_add_sp =>
-- Push(Pop()+[SP+Offset])
if mem_busy_i='0' then
-- Read SP+Offset
state <= st_add_sp2;
read_en_r <= '1';
addr_r <= sp_r+sp_offset;
pc_r <= pc_r+1;
end if;
when dec_push_sp =>
-- Push(SP)
if mem_busy_i='0' then
FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
pc_r <= pc_r+1;
a_r <= (others => '0');
a_r(ADDR_W-1 downto BYTE_BITS) <= sp_r;
Push(sp_r,a_r,b_r);
end if;
when dec_pop_pc =>
-- PC=Pop() (return)
if mem_busy_i='0' then
FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
state <= st_resync;
pc_r <= a_r(ADDR_W-1 downto 0);
sp_r <= inc_sp;
end if;
when dec_pop_pc_rel =>
-- PC=PC+Pop()
if mem_busy_i='0' and ENA_LEVEL2 then
FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
state <= st_resync;
pc_r <= a_r(ADDR_W-1 downto 0)+pc_r;
sp_r <= inc_sp;
end if;
when dec_add =>
-- Push(Pop()+Pop()) [A=A+B, SP++, update B]
if mem_busy_i='0' then
state <= st_popped;
a_r <= a_r+b_r;
read_en_r <= '1';
addr_r <= inc_inc_sp;
sp_r <= inc_sp;
end if;
when dec_sub =>
-- a=Pop(), b=Pop(), Push(b-a)
if mem_busy_i='0' and ENA_LEVEL1 then
DoBinOp(b_r-a_r,state,sp_r,addr_r,read_en_r,
a_r,bin_op_res1_r,BINOP_PIPE);
end if;
when dec_pop =>
-- Pop()
if mem_busy_i='0' then
state <= st_popped;
addr_r <= inc_inc_sp;
read_en_r <= '1';
Pop(sp_r,a_r,b_r);
end if;
when dec_pop_down =>
-- t=Pop(), Pop(), Push(t)
if mem_busy_i='0' then
-- PopDown leaves top of stack unchanged
state <= st_popped;
addr_r <= inc_inc_sp;
read_en_r <= '1';
sp_r <= inc_sp;
end if;
when dec_or =>
-- Push(Pop() or Pop())
if mem_busy_i='0' then
state <= st_popped;
a_r <= a_r or b_r;
read_en_r <= '1';
addr_r <= inc_inc_sp;
sp_r <= inc_sp;
end if;
when dec_and =>
-- Push(Pop() and Pop())
if mem_busy_i='0' then
state <= st_popped;
a_r <= a_r and b_r;
read_en_r <= '1';
addr_r <= inc_inc_sp;
sp_r <= inc_sp;
end if;
when dec_eq =>
-- a=Pop(), b=Pop(), Push(a=b ? 1 : 0)
if mem_busy_i='0' and ENA_LEVEL0 then
DoBinOpBool(a_r=b_r,state,sp_r,addr_r,read_en_r,
a_r,bin_op_res1_r,BINOP_PIPE);
end if;
when dec_u_less_than =>
-- a=Pop(), b=Pop(), Push(a<b ? 1 : 0)
if mem_busy_i='0' and ENA_LEVEL1 then
DoBinOpBool(a_r<b_r,state,sp_r,addr_r,read_en_r,
a_r,bin_op_res1_r,BINOP_PIPE);
end if;
when dec_u_less_than_or_equal =>
-- a=Pop(), b=Pop(), Push(a<=b ? 1 : 0)
if mem_busy_i='0' and ENA_LEVEL2 then
DoBinOpBool(a_r<=b_r,state,sp_r,addr_r,read_en_r,
a_r,bin_op_res1_r,BINOP_PIPE);
end if;
when dec_less_than =>
-- a=signed(Pop()), b=signed(Pop()), Push(a<b ? 1 : 0)
if mem_busy_i='0' and ENA_LEVEL1 then
DoBinOpBool(signed(a_r)<signed(b_r),state,sp_r,
addr_r,read_en_r,a_r,bin_op_res1_r,
BINOP_PIPE);
end if;
when dec_less_than_or_equal =>
-- a=signed(Pop()), b=signed(Pop()), Push(a<=b ? 1 : 0)
if mem_busy_i='0' and ENA_LEVEL2 then
DoBinOpBool(signed(a_r)<=signed(b_r),state,sp_r,
addr_r,read_en_r,a_r,bin_op_res1_r,
BINOP_PIPE);
end if;
when dec_load =>
-- Push([Pop()])
if mem_busy_i='0' then
state <= st_load2;
addr_r <= a_r(ADDR_W-1 downto BYTE_BITS);
read_en_r <= '1';
pc_r <= pc_r+1;
end if;
when dec_dup =>
-- t=Pop(), Push(t), Push(t)
if mem_busy_i='0' then
pc_r <= pc_r+1;
-- A is dupped, no change
Push(sp_r,a_r,b_r);
FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
end if;
when dec_dup_stk_b =>
-- Pop(), t=Pop(), Push(t), Push(t), Push(t)
if mem_busy_i='0' then
pc_r <= pc_r+1;
a_r <= b_r;
-- B goes to A
Push(sp_r,a_r,b_r);
FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
end if;
when dec_store =>
-- a=Pop(), b=Pop(), [a]=b
if mem_busy_i='0' then
state <= st_resync;
pc_r <= pc_r+1;
addr_r <= a_r(ADDR_W-1 downto BYTE_BITS);
data_o <= b_r;
write_en_r <= '1';
sp_r <= inc_inc_sp;
end if;
when dec_pop_sp =>
-- SP=Pop()
if mem_busy_i='0' then
FlushB(write_en_r,addr_r,inc_sp,data_o,b_r);
state <= st_resync;
pc_r <= pc_r+1;
sp_r <= a_r(ADDR_W-1 downto BYTE_BITS);
end if;
when dec_nop =>
pc_r <= pc_r+1;
when dec_not =>
-- Push(not(Pop()))
pc_r <= pc_r+1;
a_r <= not a_r;
when dec_flip =>
-- Push(flip(Pop()))
pc_r <= pc_r+1;
for i in 0 to WORD_SIZE-1 loop
a_r(i) <= a_r(WORD_SIZE-1-i);
end loop;
when dec_add_top =>
-- a=Pop(), b=Pop(), Push(b), Push(a+b)
pc_r <= pc_r+1;
a_r <= a_r+b_r;
when dec_shift =>
-- Push(Pop()<<1) [equivalent to a=Pop(), Push(a+a)]
pc_r <= pc_r+1;
a_r(WORD_SIZE-1 downto 1) <= a_r(WORD_SIZE-2 downto 0);
a_r(0) <= '0';
when dec_push_sp_add =>
-- Push(Pop()+SP)
if ENA_LEVEL0 then
pc_r <= pc_r+1;
a_r <= (others => '0');
a_r(ADDR_W-1 downto BYTE_BITS) <=
a_r(ADDR_W-1-BYTE_BITS downto 0)+sp_r;
end if;
when dec_neq_branch =>
-- a=Pop(), b=Pop(), PC+=b==0 ? 1 : a
-- Branches are almost always taken as they form loops
if ENA_LEVEL0 then
sp_r <= inc_inc_sp;
-- Need to fetch stack again.
state <= st_resync;
if b_r/=0 then
pc_r <= a_r(ADDR_W-1 downto 0)+pc_r;
else
pc_r <= pc_r+1;
end if;
end if;
when dec_mult =>
-- Push(Pop()*Pop())
if ENA_LEVEL1 then
if MULT_PIPE then
mult_a_r <= a_r;
mult_b_r <= b_r;
state <= st_mult2;
else
mult_res:=a_r*b_r;
mult_res1_r <= mult_res(WORD_SIZE-1 downto 0);
state <= st_mult5;
end if;
end if;
when dec_break =>
-- Assert the break_o signal
--report "Break instruction encountered" severity failure;
break_o <= '1';
pc_r <= pc_r+1;
when dec_loadb =>
-- Push([Pop()] & 0xFF) (byte address)
if mem_busy_i='0' and ENA_LEVEL0 then
state <= st_loadb2;
addr_r <= a_r(ADDR_W-1 downto BYTE_BITS);
addrl_r <= a_r(BYTE_BITS-1 downto 0);
--read_en_r <= '1';
byte_read_r <= '1';
pc_r <= pc_r+1;
end if;
when dec_storeb =>
-- [Pop()]=Pop() & 0xFF (byte address)
if mem_busy_i='0' and ENA_LEVEL1 then
state <= st_resync;
sp_r <= inc_inc_sp;
addr_r <= a_r(ADDR_W-1 downto BYTE_BITS);
addrl_r <= a_r(BYTE_BITS-1 downto 0);
--write_en_r <= '1';
byte_write_r <= '1';
pc_r <= pc_r+1;
--data_o(WORD_SIZE-1 downto 8) <= (others=>'0');
--data_o(7 downto 0) <= b_r(7 downto 0);
data_o(WORD_SIZE-1 downto 0) <= b_r(7 downto 0)&b_r(7 downto 0)&b_r(7 downto 0)&b_r(7 downto 0);
end if;
when dec_lshr =>
-- a=Pop(), b=Pop(), Push(b>>(a&0x3F))
if ENA_LSHR then
-- This instruction takes more than one cycle.
-- We must avoid duplications in the trace log.
dbg_o.b_inst <= not_lshr;
not_lshr:='0';
if a_r(5 downto 0)=0 then -- Only 6 bits used
-- No more shifts
if mem_busy_i='0' then
state <= st_popped;
a_r <= b_r;
read_en_r <= '1';
addr_r <= inc_inc_sp;
sp_r <= inc_sp;
not_lshr:='1';
end if;
else -- More shifts needed
b_r <= "0"&b_r(WORD_SIZE-1 downto 1);
a_r(5 downto 0) <= a_r(5 downto 0)-1;
insn <= insn;
end if;
end if;
when others =>
-- Undefined behavior, we shouldn't get here.
-- It only helps synthesis tools.
sp_r <= (others => D_CARE_VAL);
report "Illegal decode instruction?!" severity failure;
--break_o <= '1';
end case;
end if;
-- The followup of operations that takes more than one execution clock
when st_store_sp2 =>
if mem_busy_i='0' then
addr_r <= inc_sp;
read_en_r <= '1';
state <= st_popped;
end if;
when st_load_sp2 =>
if mem_busy_i='0' then
state <= st_load_sp3;
-- Now we can read SP+Offset (SP already decremented)
read_en_r <= '1';
addr_r <= sp_r+sp_offset+1;
end if;
when st_load_sp3 =>
if mem_busy_i='0' then
-- Note: We can't increment PC in the decode stage
-- because it will modify sp_offset.
pc_r <= pc_r+1;
-- Finally we have the result in A
state <= st_execute;
a_r <= data_i;
end if;
when st_add_sp2 =>
if mem_busy_i='0' then
state <= st_execute;
a_r <= a_r+data_i;
end if;
when st_load2 =>
if mem_busy_i='0' then
a_r <= data_i;
state <= st_execute;
end if;
when st_loadb2 =>
if mem_busy_i='0' then
a_r(WORD_SIZE-1 downto 8) <= (others => '0');
a_r(7 downto 0) <= data_i(7 downto 0);
state <= st_execute;
end if;
when st_fetch =>
if mem_busy_i='0' then
addr_r <= pc_r(ADDR_W-1 downto BYTE_BITS);
read_en_r <= '1';
state <= st_decode;
end if;
-- The following states can be used to leave cycles free for
-- tools that can automagically decompose the multiplication
-- in various stages. Xilinx tools can do it to increase the
-- multipliers performance.
when st_mult2 =>
state <= st_mult3;
when st_mult3 =>
state <= st_mult4;
when st_mult4 =>
state <= st_mult5;
when st_mult5 =>
if mem_busy_i='0' then
if MULT_PIPE then
a_r <= mult_res3_r;
else
a_r <= mult_res1_r;
end if;
read_en_r <= '1';
addr_r <= inc_inc_sp;
sp_r <= inc_sp;
state <= st_popped;
end if;
when st_binary_op_res =>
-- BINOP_PIPE=2
state <= st_binary_op_res2;
when st_binary_op_res2 =>
-- BINOP_PIPE>=1
read_en_r <= '1';
addr_r <= inc_inc_sp;
sp_r <= inc_sp;
state <= st_popped;
if BINOP_PIPE=2 then
a_r <= bin_op_res2_r;
else -- 1
a_r <= bin_op_res1_r;
end if;
when st_popped =>
if mem_busy_i='0' then
-- Note: Moving this PC++ to the decoder seems to
-- consume more LUTs.
pc_r <= pc_r+1;
b_r <= data_i;
state <= st_execute;
end if;
when others =>
-- Undefined behavior, we shouldn't get here.
-- It only helps synthesis tools.
sp_r <= (others => D_CARE_VAL);
report "Illegal state?!" severity failure;
--break_o <= '1';
end case; -- state
end if; -- else reset_i='1'
end if; -- rising_edge(clk_i)
end process opcode_control;
end architecture Behave; -- Entity: ZPUMediumCore