Ah, well, using SRAM then you do have a lot of room to improve the memory access. :-)
I'll have to give the Xilinx SDRAM controller a try. It looked very confusing and complicated the first time I started messing with SDRAM, so I rolled my own controller based on a simpler design I found online (hamster works). The problem with the burst mode and 64-bit transfers is that the 9900 does not really have that kind of access pattern, and you need a memory controller and cache to even begin to take advantage of any memory access over 16-bits. For completely random access, the read access time is still about 70ns on even the fastest SDRAM.
The configuration options for the Spartan 6 memory controller block (MCB) seemed daunting at first, and I could not find any VHDL examples, only one example with Verilog. But at the end of the day with the Pipistrello board it was relatively easy to setup, as the Micron memory chip was one of the supported targets on the ISE 14.7.
You are absolutely right about the burst mode and the 9900, it is not a good fit. Like you say at least a small cache would be necessary. Also the Xilinx MCB is not really byte or word or even 64-bit word oriented, since it has FIFOs both for writing and reading. Great for performance, but complex for simple setups. In the only design I have used it in (the TI-99/4A 64Meg memory expansion) I think I configured it for 32-bit access width and only wrote single words to and from the FIFOs. Basically for writes you first push your data to the write fifo, provide an address and a count, and off it goes. For reads I used a separate port (the MCB is a big piece of machinery with 6 access ports if I remember correctly) and in that port I needed to first write read address and count, then it goes off and does its thing, and while doing it one can monitor when the read FIFO starts to have data in it. So it is designed for asynchronous accesses. Should be a good fit for cache line reads and writes.
Below is the component declaration as I use it - this works (below not shown are the state machines driving reads and writes, something like 300 lines of code):
-- LPDDR controller declaration component erik_lpddr generic( C3_P0_MASK_SIZE : integer := 4; C3_P0_DATA_PORT_SIZE : integer := 32; C3_P1_MASK_SIZE : integer := 4; C3_P1_DATA_PORT_SIZE : integer := 32; C3_MEMCLK_PERIOD : integer := 5000; C3_RST_ACT_LOW : integer := 0; C3_INPUT_CLK_TYPE : string := "SINGLE_ENDED"; C3_CALIB_SOFT_IP : string := "TRUE"; C3_SIMULATION : string := "FALSE"; DEBUG_EN : integer := 0; C3_MEM_ADDR_ORDER : string := "ROW_BANK_COLUMN"; C3_NUM_DQ_PINS : integer := 16; C3_MEM_ADDR_WIDTH : integer := 13; C3_MEM_BANKADDR_WIDTH : integer := 2 ); port ( mcb3_dram_dq : inout std_logic_vector(C3_NUM_DQ_PINS-1 downto 0); mcb3_dram_a : out std_logic_vector(C3_MEM_ADDR_WIDTH-1 downto 0); mcb3_dram_ba : out std_logic_vector(C3_MEM_BANKADDR_WIDTH-1 downto 0); mcb3_dram_cke : out std_logic; mcb3_dram_ras_n : out std_logic; mcb3_dram_cas_n : out std_logic; mcb3_dram_we_n : out std_logic; mcb3_dram_dm : out std_logic; mcb3_dram_udqs : inout std_logic; mcb3_rzq : inout std_logic; mcb3_dram_udm : out std_logic; c3_sys_clk : in std_logic; c3_sys_rst_n : in std_logic; c3_calib_done : out std_logic; c3_clk0 : out std_logic; c3_rst0 : out std_logic; mcb3_dram_dqs : inout std_logic; mcb3_dram_ck : out std_logic; mcb3_dram_ck_n : out std_logic; -- Below is the first R/W port to LPDDR memory c3_p0_cmd_clk : in std_logic; c3_p0_cmd_en : in std_logic; c3_p0_cmd_instr : in std_logic_vector(2 downto 0); c3_p0_cmd_bl : in std_logic_vector(5 downto 0); c3_p0_cmd_byte_addr : in std_logic_vector(29 downto 0); c3_p0_cmd_empty : out std_logic; c3_p0_cmd_full : out std_logic; c3_p0_wr_clk : in std_logic; c3_p0_wr_en : in std_logic; c3_p0_wr_mask : in std_logic_vector(C3_P0_MASK_SIZE - 1 downto 0); c3_p0_wr_data : in std_logic_vector(C3_P0_DATA_PORT_SIZE - 1 downto 0); c3_p0_wr_full : out std_logic; c3_p0_wr_error : out std_logic; c3_p0_wr_count : out std_logic_vector(6 downto 0); c3_p0_wr_underrun : out std_logic; c3_p0_wr_empty : out std_logic; c3_p0_rd_clk : in std_logic; c3_p0_rd_en : in std_logic; c3_p0_rd_data : out std_logic_vector(C3_P0_DATA_PORT_SIZE - 1 downto 0); c3_p0_rd_full : out std_logic; c3_p0_rd_empty : out std_logic; c3_p0_rd_count : out std_logic_vector(6 downto 0); c3_p0_rd_overflow : out std_logic; c3_p0_rd_error : out std_logic; -- Below is the second R/W port to LPDDR memory c3_p1_cmd_clk : in std_logic; c3_p1_cmd_en : in std_logic; c3_p1_cmd_instr : in std_logic_vector(2 downto 0); c3_p1_cmd_bl : in std_logic_vector(5 downto 0); c3_p1_cmd_byte_addr : in std_logic_vector(29 downto 0); c3_p1_cmd_empty : out std_logic; c3_p1_cmd_full : out std_logic; c3_p1_wr_clk : in std_logic; c3_p1_wr_en : in std_logic; c3_p1_wr_mask : in std_logic_vector(C3_P1_MASK_SIZE - 1 downto 0); c3_p1_wr_data : in std_logic_vector(C3_P1_DATA_PORT_SIZE - 1 downto 0); c3_p1_wr_full : out std_logic; c3_p1_wr_empty : out std_logic; c3_p1_wr_count : out std_logic_vector(6 downto 0); c3_p1_wr_underrun : out std_logic; c3_p1_wr_error : out std_logic; c3_p1_rd_clk : in std_logic; c3_p1_rd_en : in std_logic; c3_p1_rd_data : out std_logic_vector(C3_P1_DATA_PORT_SIZE - 1 downto 0); c3_p1_rd_full : out std_logic; c3_p1_rd_empty : out std_logic; c3_p1_rd_count : out std_logic_vector(6 downto 0); c3_p1_rd_overflow : out std_logic; c3_p1_rd_error : out std_logic ); end component;