-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathf18a_gpu.vhd
2453 lines (2013 loc) · 93.1 KB
/
f18a_gpu.vhd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
--
-- F18A
-- A pin-compatible enhanced replacement for the TMS9918A VDP family.
-- https://dnotq.io
--
-- Released under the 3-Clause BSD License:
--
-- Copyright 2011-2018 Matthew Hagerty (matthew <at> dnotq <dot> io)
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions are met:
--
-- 1. Redistributions of source code must retain the above copyright notice,
-- this list of conditions and the following disclaimer.
--
-- 2. Redistributions in binary form must reproduce the above copyright
-- notice, this list of conditions and the following disclaimer in the
-- documentation and/or other materials provided with the distribution.
--
-- 3. Neither the name of the copyright holder nor the names of its
-- contributors may be used to endorse or promote products derived from this
-- software without specific prior written permission.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-- POSSIBILITY OF SUCH DAMAGE.
-- Version history. See README.md for details.
--
-- V1.9 Dec 31, 2018
-- V1.8 Aug 24, 2016
-- V1.7 Jan 1, 2016
-- V1.6 May 3, 2014 .. Apr 26, 2015
-- V1.5 Jul 23, 2013
-- V1.4 Mar 20, 2013 .. Apr 26, 2013
-- V1.3 Jul 26, 2012, Release firmware
-- 100MHz TMS9900-compatible CPU (called the "GPU" in the F18A)
--
-- Notable differences between this implementation and the original 9900:
--
-- Does not implement all instructions.
--
-- Certain instructions are modified for alternate use.
--
-- Does not attempt to maintain original instruction timing.
--
-- The 16 general purpose registers (R0..R15) are a real register-file and
-- not implemented in RAM.
--
-- Uses a hard-coded instruction decode and control vs. a microcoded control
-- model of the original 9900.
--
-- Does not use the ALU for PC and other registers calculations. Dedicated
-- adders are used instead.
--
-- The GPU has a not-so-great interface with the F18A host-CPU interface and
-- will be blocked at certain points to prevent VRAM contention. This really
-- needs to be reworked, if only to make the implementation simpler (and
-- probably use less FPGA resources).
--
-- Most instructions take around 60ns to 150ns depending on memory access, and
-- have a 1-clock execute cycle.
--
-- The MUL, DIV, and Shift instructions are much faster than the original 9900
-- CPU. The execute cycle for MUL is 1-clock (10ns) like other instructions.
-- The DIV and Shift instructions take a maximum of 16-clock cycles for the
-- execution cycle.
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use ieee.std_logic_unsigned.all;
-- f18a_gpu: top-level port list of the F18A "GPU" (TMS9900-compatible CPU).
--
-- All vectors use ascending (0 to N) ranges, so index 0 is the left-most bit.
-- The core talks to three external memories (VRAM, palette, VDP registers),
-- receives a few video-timing/status inputs, and drives an SPI port.
entity f18a_gpu is
port (
clk : in std_logic; -- clock; internal registers update on the rising edge
rst_n : in std_logic; -- reset and load PC, active low
trigger : in std_logic; -- trigger the GPU
running : out std_logic; -- '1' if the GPU is running, '0' when idle
pause : in std_logic; -- pause the GPU, active high
pause_ack : out std_logic; -- acknowledge pause
load_pc : in std_logic_vector(0 to 15); -- 16-bit value; presumably the PC loaded on reset -- confirm
-- VRAM Interface
vdin : in std_logic_vector(0 to 7); -- 8-bit read data
vwe : out std_logic; -- write enable
vaddr : out std_logic_vector(0 to 13); -- 14-bit address
vdout : out std_logic_vector(0 to 7); -- 8-bit write data
-- Palette Interface
pdin : in std_logic_vector(0 to 11); -- 12-bit read data
pwe : out std_logic; -- write enable
paddr : out std_logic_vector(0 to 5); -- 6-bit address
pdout : out std_logic_vector(0 to 11); -- 12-bit write data
-- Register Interface
rdin : in std_logic_vector(0 to 7); -- 8-bit read data
raddr : out std_logic_vector(0 to 13); -- 14 bits wide, same width as vaddr
rwe : out std_logic; -- write enable for VDP registers
-- Data inputs
scanline : in std_logic_vector(0 to 7);
blank : in std_logic; -- '1' when blanking (horz and vert)
bmlba : in std_logic_vector(0 to 7); -- bitmap layer base address
bml_w : in std_logic_vector(0 to 7); -- bitmap layer width
pgba : in std_logic; -- pattern generator base address
-- Data output, 7-bits of user defined status
gstatus : out std_logic_vector(0 to 6);
-- SPI Interface
-- NOTE(review): clock polarity and chip-select polarity are not visible in
-- this declaration -- confirm against the SPI state machine.
spi_clk : out std_logic;
spi_cs : out std_logic;
spi_mosi : out std_logic;
spi_miso : in std_logic
);
end f18a_gpu;
architecture rtl of f18a_gpu is
-- **NOTE**
-- These are also defined in the CPU module to avoid using actual paths
-- and resources to transfer a constant to the GPU module.
constant VMAJOR : std_logic_vector(0 to 3) := X"1";
constant VMINOR : std_logic_vector(0 to 3) := X"9";
-- 2K private dedicated RAM for the GPU
type gpuram_type is array (0 to 2047) of std_logic_vector(0 to 7);
signal gpuram : gpuram_type := (
x"02",x"0F",x"47",x"FE",x"10",x"0D",x"40",x"36",x"40",x"5A",x"40",x"94",x"40",x"B4",x"40",x"FA",
x"FF",x"FF",x"FF",x"FF",x"FF",x"FF",x"FF",x"FF",x"FF",x"FF",x"FF",x"FF",x"FF",x"FF",x"FF",x"FF",
x"0C",x"A0",x"41",x"1C",x"03",x"40",x"04",x"C1",x"D0",x"60",x"3F",x"00",x"09",x"71",x"C0",x"21",
x"40",x"06",x"06",x"90",x"10",x"F7",x"C0",x"20",x"3F",x"02",x"C0",x"60",x"3F",x"04",x"C0",x"A0",
x"3F",x"06",x"D0",x"E0",x"3F",x"01",x"13",x"05",x"D0",x"10",x"DC",x"40",x"06",x"02",x"16",x"FD",
x"10",x"03",x"DC",x"70",x"06",x"02",x"16",x"FD",x"04",x"5B",x"0D",x"0B",x"06",x"A0",x"40",x"B4",
x"0F",x"0B",x"C1",x"C7",x"13",x"16",x"04",x"C0",x"D0",x"20",x"60",x"04",x"0A",x"30",x"C0",x"C0",
x"04",x"C1",x"02",x"02",x"04",x"00",x"CC",x"01",x"06",x"02",x"16",x"FD",x"04",x"C0",x"D0",x"20",
x"41",x"4F",x"06",x"C0",x"0A",x"30",x"A0",x"03",x"0C",x"A0",x"41",x"AC",x"D8",x"20",x"41",x"4F",
x"B0",x"00",x"04",x"5B",x"D8",x"20",x"41",x"1A",x"3F",x"00",x"02",x"00",x"41",x"D4",x"C8",x"00",
x"3F",x"02",x"02",x"00",x"40",x"06",x"C8",x"00",x"3F",x"04",x"02",x"00",x"40",x"10",x"C8",x"00",
x"3F",x"06",x"04",x"5B",x"04",x"C7",x"D0",x"20",x"3F",x"01",x"13",x"13",x"C0",x"20",x"41",x"18",
x"06",x"00",x"0C",x"A0",x"41",x"50",x"02",x"04",x"00",x"05",x"02",x"05",x"3F",x"02",x"02",x"06",
x"41",x"40",x"8D",x"B5",x"16",x"03",x"06",x"04",x"16",x"FC",x"10",x"09",x"06",x"00",x"16",x"F1",
x"10",x"09",x"C0",x"20",x"3F",x"02",x"0C",x"A0",x"41",x"50",x"80",x"40",x"14",x"03",x"0C",x"A0",
x"41",x"98",x"05",x"47",x"D8",x"07",x"B0",x"00",x"04",x"5B",x"0D",x"0B",x"06",x"A0",x"40",x"B4",
x"0F",x"0B",x"C1",x"C7",x"13",x"04",x"C0",x"20",x"3F",x"0C",x"0C",x"A0",x"41",x"AC",x"04",x"5B",
x"05",x"00",x"00",x"00",x"00",x"00",x"00",x"00",x"00",x"00",x"00",x"00",x"02",x"00",x"41",x"10",
x"02",x"01",x"41",x"14",x"02",x"02",x"0B",x"00",x"03",x"A0",x"32",x"02",x"32",x"30",x"32",x"30",
x"32",x"30",x"02",x"02",x"00",x"07",x"36",x"31",x"06",x"02",x"16",x"FD",x"03",x"C0",x"0C",x"00",
x"20",x"20",x"20",x"20",x"20",x"20",x"20",x"20",x"20",x"20",x"00",x"00",x"00",x"00",x"00",x"00",
x"88",x"00",x"41",x"18",x"1A",x"03",x"C0",x"60",x"41",x"18",x"0C",x"00",x"0D",x"00",x"0A",x"40",
x"02",x"01",x"0B",x"00",x"A0",x"20",x"41",x"16",x"17",x"01",x"05",x"81",x"A0",x"60",x"41",x"14",
x"02",x"03",x"41",x"40",x"02",x"02",x"00",x"10",x"03",x"A0",x"32",x"01",x"06",x"C1",x"32",x"01",
x"32",x"00",x"06",x"C0",x"32",x"00",x"36",x"00",x"36",x"33",x"06",x"02",x"16",x"FD",x"03",x"C0",
x"0F",x"00",x"C0",x"60",x"41",x"18",x"0C",x"00",x"02",x"00",x"3F",x"00",x"02",x"01",x"41",x"40",
x"02",x"02",x"00",x"08",x"CC",x"31",x"06",x"02",x"16",x"FD",x"0C",x"00",x"02",x"01",x"41",x"4A",
x"D0",x"A0",x"41",x"4E",x"06",x"C2",x"D0",x"A0",x"41",x"4D",x"02",x"03",x"0B",x"00",x"03",x"A0",
x"32",x"03",x"32",x"31",x"32",x"31",x"32",x"31",x"36",x"01",x"36",x"30",x"06",x"02",x"16",x"FD",
x"03",x"C0",x"0C",x"00",x"03",x"40",
-- 470 bytes
others => (others => '0'));
-- Workspace registers R0..R15 are held in a dedicated 16 x 16-bit register
-- file rather than in RAM. Every entry is initialised to zero.
type regfile_type is array (0 to 15) of std_logic_vector(0 to 15);
signal regfile : regfile_type := (others => (others => '0'));
-- Main FSM state control.
type cpu_state_type is (
st_cpu_idle, st_cpu_fetch, st_cpu_fetch_msb, st_cpu_fetch_lsb, st_cpu_latch_ir, st_cpu_decode,
st_cpu_resolve_src, st_cpu_save_src, st_cpu_resolve_dst, st_cpu_save_dst,
st_cpu_alu_op, st_cpu_alu_to_ws, st_cpu_alu_to_mem, st_cpu_alu_to_mem_lsb,
st_cpu_status, st_cpu_b_op,
st_cpu_x_op,
st_cpu_mpy_op, st_cpu_mpy_dst, st_cpu_mpy_wait, st_cpu_mpy_msb, st_cpu_mpy_done,
st_cpu_div_op, st_cpu_div_msb, st_cpu_div_wait, st_cpu_div_done,
st_cpu_shift_op, st_cpu_shift_count, st_cpu_shift_done,
st_cpu_spi_op, st_cpu_spi_wait,
st_cpu_pix_op, st_cpu_pix_set, st_cpu_pix_read, st_cpu_pix_write, st_cpu_pix_done,
st_cpu_load_immd, st_cpu_load_immd_msb, st_cpu_load_immd_lsb,
st_cpu_mem_wr, st_cpu_mem_wri, st_cpu_mem_wri_msb, st_cpu_mem_wri_lsb, st_cpu_mem_wri_done,
st_cpu_mem_sym, st_cpu_mem_sym_msb1, st_cpu_mem_sym_lsb1,
st_cpu_mem_sym_msb2, st_cpu_mem_sym_lsb2, st_cpu_mem_sym_done,
st_cpu_mem_idx, st_cpu_mem_idx_msb1, st_cpu_mem_idx_lsb1, st_cpu_mem_idx_ea,
st_cpu_mem_idx_msb2, st_cpu_mem_idx_lsb2, st_cpu_mem_idx_done
);
signal cpu_state : cpu_state_type;
signal cpu_state_hold : cpu_state_type;
signal cpu_state_return : cpu_state_type;
signal src_state_sel : cpu_state_type;
signal dst_state_sel : cpu_state_type;
signal cpu_state_alu_store : cpu_state_type;
signal cpu_state_t0 : cpu_state_type; -- after decode
signal cpu_state_t1 : cpu_state_type; -- after src
signal cpu_state_t2 : cpu_state_type; -- after dst
signal cpu_state_t3 : cpu_state_type; -- after alu
signal cpu_state_t4 : cpu_state_type; -- after store
signal hold : std_logic;
signal pause_req : std_logic;
signal pause_ack_reg : std_logic;
signal running_reg : std_logic;
signal bl_op_en : std_logic; -- '1' for branch and link operation
signal rtwp_en : std_logic; -- '1' for the RTWP instruction
signal jump_op_en : std_logic; -- '1' for a jump operation
signal immd_to_src : std_logic; -- '1' to assign an immediate value as the src vs dst
-- Decoder priority encoder.
type format_type is (format1, format2, format3, format4, format5, format6, format7);
signal format : format_type;
signal format_next : format_type;
-- ALU control.
type alu_ctrl_type is (
alu_mov, alu_add, alu_sub, alu_cmp, alu_coc, alu_inc, alu_inct,
alu_dec, alu_dect, alu_andi, alu_czc, alu_andn, alu_or, alu_xor,
alu_clr, alu_seto, alu_inv, alu_neg, alu_swpb, alu_abs, alu_div,
alu_shift
);
signal alu_ctrl : alu_ctrl_type;
signal alu_next : alu_ctrl_type;
-- Jump control.
type jump_ctrl_type is (
jump_jmp, jump_jlt, jump_jle, jump_jeq, jump_jhe, jump_jgt, jump_jne,
jump_jnc, jump_joc, jump_jno, jump_jl, jump_jh, jump_jop
);
signal jump_ctrl : jump_ctrl_type;
signal take_jump : std_logic;
-- Memory address register mux control.
type mar_ctrl_type is (ctrl_mar_pc, ctrl_mar_t1);
signal mar_ctrl : mar_ctrl_type;
-- GPU dedicated RAM
signal gaddr : std_logic_vector(0 to 10);
signal gdout : std_logic_vector(0 to 7);
signal gdin : std_logic_vector(0 to 7);
signal gwe_reg : std_logic;
signal gwe_next : std_logic;
-- Workspace register file
signal ws_addr : std_logic_vector(0 to 3) := "0000"; -- Default to keep simulation quiet
signal ws_dst : std_logic_vector(0 to 3); -- destination ws reg
signal ws_dout : std_logic_vector(0 to 15);
signal ws_dout_inc: std_logic_vector(0 to 15);
signal ws_din : std_logic_vector(0 to 15);
signal ws_din_mux : std_logic_vector(0 to 15);
signal ws_dst_save: std_logic_vector(0 to 15); -- original dst ws reg value for byte ops
signal ws_we : std_logic;
-- Stack support
signal ws_inc_flag : std_logic; -- '1' to inc ws reg, '0' to dec
signal ws_pre_flag : std_logic; -- '1' to pre inc/dec ws reg
signal stack_to_pc_en : std_logic; -- '1' to store the stack op to the PC (ret)
signal pc_to_stack_en : std_logic; -- '1' to store the PC on the stack (call)
-- Memory addressing
signal mar : std_logic_vector(0 to 15); -- memory address register
signal mar_sel : std_logic_vector(0 to 3);
signal mar_low4 : std_logic_vector(0 to 3);
signal mem_din : std_logic_vector(0 to 7); -- memory data in
signal mem_dout : std_logic_vector(0 to 7); -- memory data out
signal rdin_reg : std_logic_vector(0 to 7); -- VDP register read data in
signal gstatus_reg: std_logic_vector(0 to 6); -- 7-bits of user defined status
signal blank_reg : std_logic; -- register the blank input
-- Write enable registers for external memory
signal vwe_reg : std_logic; -- VRAM write enable register
signal rwe_reg : std_logic; -- VDP-register write enable register
signal pwe_reg : std_logic; -- palette write enable, word ops only
signal swe_reg : std_logic; -- gpu status write enable
signal vwe_next : std_logic;
signal rwe_next : std_logic;
signal pwe_next : std_logic;
signal swe_next : std_logic;
signal pc : std_logic_vector(0 to 15); -- program counter
signal pc_inc : std_logic_vector(0 to 15); -- program counter + 1
signal pc_jump : std_logic_vector(0 to 15); -- program counter + jump displacement
signal ir : std_logic_vector(0 to 15); -- instruction register
signal t1 : std_logic_vector(0 to 15); -- temp 1
signal t2 : std_logic_vector(0 to 15); -- temp 2
signal ea_t1t2 : std_logic_vector(0 to 15); -- t1 + t2
signal ea_src : std_logic_vector(0 to 15); -- effective address of the source
signal ea_dst : std_logic_vector(0 to 15); -- effective address of the destination
-- Opcode Decoding
signal byte_next : std_logic;
signal force_byte : std_logic; -- force a byte selector
signal byte : std_logic; -- byte selector
signal Td : std_logic_vector(0 to 1); -- destination mode: Td
signal D : std_logic_vector(0 to 3); -- destination: D or C
signal Ts : std_logic_vector(0 to 1); -- source mode: Ts
signal S : std_logic_vector(0 to 3); -- source: S or W
signal C : std_logic_vector(0 to 3); -- count: C
signal disp : std_logic_vector(0 to 7); -- signed jump displacement
signal Ts_next : std_logic_vector(0 to 1); -- provide a Ts override
signal S_next : std_logic_vector(0 to 3); -- provide a S override
signal Td_next : std_logic_vector(0 to 1); -- provide a Td override
signal D_next : std_logic_vector(0 to 3); -- provide a D override
-- Shifter
type shift_ctrl_type is (shift_sla, shift_slc, shift_srl, shift_sra, shift_src);
signal shift_ctrl : shift_ctrl_type;
signal shift_next : shift_ctrl_type;
signal shift_cnt : integer range 0 to 16;
signal shift_cnt_init : integer range 0 to 16;
signal shift_reg : std_logic_vector(0 to 15);
signal shift_carry : std_logic;
signal shift_dir : std_logic;
signal shift_bit : std_logic;
signal shift_load : std_logic;
signal shift_done : std_logic;
signal shift_msb : std_logic;
signal shift_overflow : std_logic;
-- ALU
signal alu_out : std_logic_vector(0 to 16); -- ALU result, 17-bit to include the carry bit
signal alu_reg : std_logic_vector(0 to 15); -- ALU result register
signal src_oper : std_logic_vector(0 to 15); -- source operand
signal alu_sa : std_logic_vector(0 to 15); -- source value to ALU
signal alu_sa_n : std_logic_vector(0 to 15); -- inverted source value to ALU
signal dst_oper : std_logic_vector(0 to 15); -- destination operand
signal alu_da : std_logic_vector(0 to 15); -- destination value to ALU
signal alu_to_ws : std_logic; -- write the ALU result to ws reg vs memory
signal alu_carry : std_logic; -- ALU carry bit
-- Multiply
signal mpy_out : std_logic_vector(0 to 31); -- 32-bit multiply result
-- signal mac_r, mac_x : std_logic_vector(0 to 31);-- 32-bit multiply-accumulate
-- signal mac_clr_r, mac_clr_x : std_logic; -- MAC clear
-- Divide
signal div_overflow : std_logic;
signal div_reset : std_logic;
signal div_start : std_logic;
signal div_done : std_logic;
signal div_rmd : std_logic_vector(0 to 15);
signal div_quo : std_logic_vector(0 to 15);
-- Source and destination flags
signal auto_inc : std_logic; -- auto increment destination
signal src_is_ws : std_logic; -- write src to ws reg vs memory
signal src_autoinc: std_logic; -- auto increment source
signal dst_is_ws : std_logic; -- write dst to ws reg vs memory
signal dst_autoinc: std_logic; -- auto increment destination
-- Equality tests for status flags
signal sa_eq_da : std_logic;
signal sa_msb_eq_da : std_logic;
signal sa_eq_8000 : std_logic;
signal sa_eq_zero : std_logic;
signal da_eq_zero : std_logic;
signal alu_eq_zero : std_logic;
signal alu_msb_eq_da : std_logic;
-- Status flags
-- Each flag has a matching *_next signal. The names look like the TMS9900
-- status-register bits (logical/arithmetic greater-than, equal, carry,
-- overflow, odd parity) -- TODO confirm against the status update logic.
signal LGT : std_logic := '0'; -- defaults to keep simulation quiet
signal AGT : std_logic := '0';
signal EQUAL : std_logic := '0';
signal CARRY : std_logic := '0';
signal OVFLW : std_logic := '0';
signal PARITY : std_logic := '0';
signal LGT_next : std_logic := '0';
signal AGT_next : std_logic := '0';
signal EQUAL_next : std_logic := '0';
signal CARRY_next : std_logic := '0';
signal OVFLW_next : std_logic := '0';
signal PARITY_next : std_logic := '0';
signal status_sel : std_logic;
signal status_bits : std_logic_vector(0 to 5); -- 6 bits wide, one per flag above (presumably) -- confirm
-- DMA data
-- NOTE(review): the _r / _x suffix pairs appear to be register / next-value
-- (wmul_r is loaded from wmul_x in a clocked process); _s presumably marks a
-- plain combinational signal -- confirm against the DMA logic.
signal dwe_r, dwe_x : std_logic;
signal dma_src_r, dma_src_x, dma_src_s : std_logic_vector(0 to 15);
signal dma_src_msb_r, dma_src_msb_x : std_logic_vector(0 to 7);
signal dma_src_lsb_r, dma_src_lsb_x : std_logic_vector(0 to 7);
signal dma_dst_r, dma_dst_x, dma_dst_s : std_logic_vector(0 to 15);
signal dma_dst_msb_r, dma_dst_msb_x : std_logic_vector(0 to 7);
signal dma_dst_lsb_r, dma_dst_lsb_x : std_logic_vector(0 to 7);
signal dma_w_r, dma_w_x : std_logic_vector(0 to 7);
signal dma_h_r, dma_h_x : std_logic_vector(0 to 7);
signal dma_stride_r, dma_stride_x : std_logic_vector(0 to 7);
signal dma_copy_r, dma_copy_x : std_logic; -- '0' for src -> dst copy, otherwise '1' for dst fill.
signal dma_inc_r, dma_inc_x : std_logic; -- '0' for address increment, otherwise '1' for decrement.
signal dma_step_s : std_logic_vector(0 to 15);
signal dma_w_minus_1_s : std_logic_vector(0 to 7);
signal dma_diff_r, dma_diff_x, dma_diff_s : std_logic_vector(0 to 7);
signal dma_diff_sign_s : std_logic_vector(0 to 7);
signal dma_w_cnt_r, dma_w_cnt_x : std_logic_vector(0 to 7);
signal dma_w_rst_r, dma_w_rst_x : std_logic_vector(0 to 7);
signal dma_h_cnt_r, dma_h_cnt_x : std_logic_vector(0 to 7);
signal dma_data_r, dma_data_x, dma_data_s : std_logic_vector(0 to 7);
-- DMA control
type dma_type is (DMA_IDLE, DMA_WAIT, DMA_SRC, DMA_DST);
signal dma_r, dma_x : dma_type;
signal dma_pause_ack_s : std_logic; -- '1' when the DMA acknowledges a CPU pause request
signal dma_active_s : std_logic; -- '1' when the DMA is active
signal dma_mar_s : std_logic; -- '1' when the VRAM MAR will use the DMA address
signal dma_we_s : std_logic;
signal dma_addr_s : std_logic_vector(0 to 15);
signal dma_pause_r, dma_pause_s : std_logic;
signal dma_trig_r, dma_trig_x : std_logic;
-- SPI
-- FSM states: idle, clock-high phase, clock-low phase, transfer complete.
type spi_state_type is (st_spi_idle, st_spi_clk1, st_spi_clk0, st_spi_done);
signal spi_state : spi_state_type;
-- Chip-select register and its next value (updated outside this section).
signal spi_cs_reg : std_logic;
signal spi_cs_next : std_logic;
signal spi_clk_reg : std_logic; -- registered SPI clock output
signal spi_counter : integer range 0 to 7; -- bits clocked so far in the current byte
signal spi_en : std_logic; -- transfer request, held until spi_done is seen (driven outside this section)
signal spi_done : std_logic; -- '1' once all 8 bits have been clocked
signal spi_din : std_logic_vector(0 to 7); -- receive shift register (MISO shifted in at index 7)
signal spi_dout : std_logic_vector(0 to 7); -- transmit shift register (index 0 drives MOSI)
-- Bitmap layer calculations
signal wmul_zadj_s : std_logic; -- '1' when bml_w = 0; becomes the 9th (top) bit of the width
signal wmul9bit : std_logic_vector(0 to 8); -- 9-bit (width + 3)
signal wmul_x, wmul_r: std_logic_vector(0 to 7); -- (width + 3) / 4 = bytes per bitmap row, next/registered
signal bml_yoff : std_logic_vector(0 to 15); -- registered y * wmul product
signal bml_addr : std_logic_vector(0 to 15); -- byte address of the addressed bitmap pixel
signal gm2_addr : std_logic_vector(0 to 15); -- GM2 pattern byte address for an x,y coordinate
signal pix_eq : std_logic; -- result of the PIX compare test
signal pix_in : std_logic_vector(0 to 1); -- existing 2-bit pixel selected from vdin
signal pix_out : std_logic_vector(0 to 7); -- vdin with the selected pixel replaced by the new pixel
begin
-- Bitmap layer pixel address calculation
--
-- Inputs:
--   src_oper : x in bits 0..7, y in bits 8..15 (t2 carries the same value)
--   bml_w    : bitmap width in pixels
--
-- Four 2-bit pixels are packed per byte, so:
--   wmul  = (w + 3) >> 2             bytes per row (w + 3 == w - 1 + 4)
--   byte  = (y * wmul) + (x >> 2)    byte offset from the base address
--   index = x & 0x03                 pixel within the byte
--
-- A width of 0 sets the extra 9th bit before the +3 is applied.
wmul_zadj_s <= '1' when bml_w = 0 else '0';
wmul9bit    <= (wmul_zadj_s & bml_w) + 3;       -- w + 3, 9-bit
wmul_x      <= '0' & wmul9bit(0 to 6);          -- drop the two LSbs (/4), back to 8-bit

process (clk)
begin
    if rising_edge(clk) then
        -- y offset from the base address, 8x8 -> 16-bit multiplier.  t2 is
        -- used rather than src_oper to break a link between the multiplier
        -- and mem_din via src_oper.
        bml_yoff <= t2(8 to 15) * wmul_r;
        wmul_r   <= wmul_x;
    end if;
end process;

-- base + row offset + (x >> 2).  Only the low 14 bits are summed and the
-- top two address bits are forced to "00" so the result stays in the VRAM.
bml_addr <= "00" & (
    (bmlba & "000000") +
    bml_yoff(2 to 15) +
    ("00000000" & src_oper(0 to 5)));
-- PIX read-modify-write pixel mixer.
--
-- ws_dout layout:
--   01234567 89012345
--   MAxxRWCE xxOOxxPP
-- Bits used here: C (6) and E (7) control the compare result, OO (10..11)
-- is the pixel value compared against, PP (14..15) is the new pixel.
--
-- src_oper(6 to 7) (the two LSbs of x) picks which of the four 2-bit
-- pixels in the VRAM byte is read (pix_in) and replaced (pix_out).
-- pix_eq is C and E when the existing pixel equals OO, otherwise
-- C and (not E).
--
-- vdin is used instead of the mem_din mux for two reasons.  First, and
-- most important, mem_din caused a long setup constraint because too many
-- unrelated signals feed that mux (using vdin costs about 2% more slices).
-- Second, the PIX instruction really only works with VRAM, which does
-- *NOT* include the GPU RAM.
process (src_oper, vdin, ws_dout)
begin
    case src_oper(6 to 7) is

        when "00" =>
            pix_in  <= vdin(0 to 1);
            pix_out <= ws_dout(14 to 15) & vdin(2 to 7);
            if ws_dout(10 to 11) = vdin(0 to 1) then
                pix_eq <= ws_dout(6) and ws_dout(7);
            else
                pix_eq <= ws_dout(6) and (not ws_dout(7));
            end if;

        when "01" =>
            pix_in  <= vdin(2 to 3);
            pix_out <= vdin(0 to 1) & ws_dout(14 to 15) & vdin(4 to 7);
            if ws_dout(10 to 11) = vdin(2 to 3) then
                pix_eq <= ws_dout(6) and ws_dout(7);
            else
                pix_eq <= ws_dout(6) and (not ws_dout(7));
            end if;

        when "10" =>
            pix_in  <= vdin(4 to 5);
            pix_out <= vdin(0 to 3) & ws_dout(14 to 15) & vdin(6 to 7);
            if ws_dout(10 to 11) = vdin(4 to 5) then
                pix_eq <= ws_dout(6) and ws_dout(7);
            else
                pix_eq <= ws_dout(6) and (not ws_dout(7));
            end if;

        when "11" =>
            pix_in  <= vdin(6 to 7);
            pix_out <= vdin(0 to 5) & ws_dout(14 to 15);
            if ws_dout(10 to 11) = vdin(6 to 7) then
                pix_eq <= ws_dout(6) and ws_dout(7);
            else
                pix_eq <= ws_dout(6) and (not ws_dout(7));
            end if;

        when others =>
            null;

    end case;
end process;
-- Address of the original GM2 pattern byte for an x,y coordinate.  Only the
-- MSb of the pattern base from VR4 is used, so the table starts at 0K or 8K.
--
-- src_oper bit layout:
--    0  1  2  3  4  5  6  7| 8  9 10 11 12 13 14 15
--   X0 X1 X2 X3 X4 X5 X6 X7|Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7
--
-- address = (x AND >F8) + (y / 8) * 256 + (y % 8)
gm2_addr <= "00" & (
    -- x with the three pixel-index bits masked out
    ("0000" & src_oper(0 to 4) & "000") +
    -- pattern table base, then y / 8 * 256 + (y % 8)
    (pgba & src_oper(8 to 12) & "00000" & src_oper(13 to 15)));
-- SPI master
--   * Always 8-bit transfers, MSb first.
--   * 50MHz SPI clock (clk/2): one clk cycle high, one clk cycle low.
--   * CKON, CKOF instructions control the CS line.
--   * LDCR writes 1 byte to the SPI, STCR reads 1 byte from the SPI.
--   * Holds the GPU until SPI is done (160ns due to 50MHz clk).
--   * Not affected by CPU pause request.
spi_cs  <= spi_cs_reg;
spi_clk <= spi_clk_reg;
-- The output data is always the MSb of the shift register; the line is
-- released ('Z') while no transfer is in progress.
spi_mosi <= 'Z' when spi_state = st_spi_idle else spi_dout(0);

process (clk)
begin
    if rising_edge(clk) then
        if rst_n = '0' then
            spi_state   <= st_spi_idle;
            spi_clk_reg <= '0';
            spi_done    <= '0';
        else
            case spi_state is

                when st_spi_idle =>
                    -- Keep the shifter loaded and the bit counter cleared
                    -- so a transfer can start as soon as spi_en is seen.
                    spi_done    <= '0';
                    spi_clk_reg <= '0';
                    spi_counter <= 0;
                    spi_dout    <= src_oper(0 to 7);
                    if spi_en = '1' then
                        spi_state <= st_spi_clk1;
                    else
                        spi_state <= st_spi_idle;
                    end if;

                when st_spi_clk1 =>
                    spi_clk_reg <= '1';
                    -- Read data in on the rising edge.
                    spi_din <= spi_din(1 to 7) & spi_miso;
                    -- Count and test for done.  The counter is only
                    -- incremented while below 7: spi_counter is declared
                    -- "integer range 0 to 7", so an unconditional +1 on the
                    -- last bit would assign 8, which is a range violation
                    -- in simulation.  The counter is cleared again in
                    -- st_spi_idle before its next use.
                    if spi_counter = 7 then
                        spi_state <= st_spi_done;
                        spi_done  <= '1';
                    else
                        spi_counter <= spi_counter + 1;
                        spi_state   <= st_spi_clk0;
                    end if;

                when st_spi_clk0 =>
                    spi_state   <= st_spi_clk1;
                    spi_clk_reg <= '0';
                    -- Change the data out on the falling edge.
                    spi_dout <= spi_dout(1 to 7) & '0';

                when st_spi_done =>
                    -- Wait for the GPU to take down the spi_en flag.
                    spi_clk_reg <= '0';
                    if spi_en = '0' then
                        spi_state <= st_spi_idle;
                    else
                        spi_state <= st_spi_done;
                    end if;

            end case;
        end if;
    end if;
end process;
-- DMA register map (GPU address space, selected by the low address nibble)
--   8xx0 - MSB src
--   8xx1 - LSB src
--   8xx2 - MSB dst
--   8xx3 - LSB dst
--   8xx4 - width
--   8xx5 - height
--   8xx6 - stride
--   8xx7 - 0..5 | !INC/DEC | !COPY/FILL   (bit 6 = inc/dec, bit 7 = copy/fill)
--   8xx8 - trigger
--
-- src, dst, width, height, stride are copied to dedicated counters when
-- the DMA is triggered, thus the original values remain unchanged.
--
-- Next-value logic for write access to the DMA registers.
process (mar, mem_dout, dwe_r, dma_r, dma_trig_r,
         dma_src_msb_r, dma_src_lsb_r, dma_dst_msb_r, dma_dst_lsb_r,
         dma_w_r, dma_h_r, dma_stride_r, dma_copy_r, dma_inc_r)
begin
    -- The trigger is only asserted while >8xx8 is being written.
    dma_trig_x    <= '0';

    -- Every register holds its value unless it is the one being written.
    dma_inc_x     <= dma_inc_r;
    dma_copy_x    <= dma_copy_r;
    dma_stride_x  <= dma_stride_r;
    dma_h_x       <= dma_h_r;
    dma_w_x       <= dma_w_r;
    dma_dst_lsb_x <= dma_dst_lsb_r;
    dma_dst_msb_x <= dma_dst_msb_r;
    dma_src_lsb_x <= dma_src_lsb_r;
    dma_src_msb_x <= dma_src_msb_r;

    if dwe_r = '1' then
        case mar(12 to 15) is
            when x"0" =>
                dma_src_msb_x <= mem_dout;
            when x"1" =>
                dma_src_lsb_x <= mem_dout;
            when x"2" =>
                dma_dst_msb_x <= mem_dout;
            when x"3" =>
                dma_dst_lsb_x <= mem_dout;
            when x"4" =>
                dma_w_x <= mem_dout;
            when x"5" =>
                dma_h_x <= mem_dout;
            when x"6" =>
                dma_stride_x <= mem_dout;
            when x"7" =>
                dma_copy_x <= mem_dout(7);
                dma_inc_x  <= mem_dout(6);
            when x"8" =>
                dma_trig_x <= '1';
            when others =>
                null;
        end case;
    end if;
end process;
-- DMA programming registers and trigger flag: plain clocked registers
-- with no reset and no enable.
process (clk)
begin
    if rising_edge(clk) then
        dma_trig_r    <= dma_trig_x;
        dma_inc_r     <= dma_inc_x;
        dma_copy_r    <= dma_copy_x;
        dma_stride_r  <= dma_stride_x;
        dma_h_r       <= dma_h_x;
        dma_w_r       <= dma_w_x;
        dma_dst_lsb_r <= dma_dst_lsb_x;
        dma_dst_msb_r <= dma_dst_msb_x;
        dma_src_lsb_r <= dma_src_lsb_x;
        dma_src_msb_r <= dma_src_msb_x;
    end if;
end process;
-- Per-byte address step: -1 when the !INC/DEC flag is set, otherwise +1.
dma_step_s <= x"FFFF" when dma_inc_r /= '0' else x"0001";

-- End-of-row adjustment: stride-(w-1) for the positive direction or
-- (w-1)-stride for the negative direction.
dma_w_minus_1_s <= dma_w_r - 1;
dma_diff_s <= (dma_stride_r - dma_w_minus_1_s) when dma_inc_r = '0' else
              (dma_w_minus_1_s - dma_stride_r);

-- Upper byte used to sign-extend the latched adjustment to 16 bits:
-- eight copies of its MSb.
dma_diff_sign_s <= (others => dma_diff_r(0));
-- Next source / destination address candidates.
process (dma_w_cnt_r, dma_step_s, dma_src_r, dma_dst_r, dma_diff_sign_s, dma_diff_r)
begin
    if dma_w_cnt_r = 1 then
        -- Last byte of the row: add the sign-extended stride difference.
        dma_src_s <= dma_src_r + (dma_diff_sign_s & dma_diff_r);
        dma_dst_s <= dma_dst_r + (dma_diff_sign_s & dma_diff_r);
    else
        -- Inside a row: step by +1 or -1.
        dma_src_s <= dma_src_r + dma_step_s;
        dma_dst_s <= dma_dst_r + dma_step_s;
    end if;
end process;
-- DMA address select: the destination address is only presented during
-- the destination (write) state.
dma_addr_s <= dma_src_r when dma_r /= DMA_DST else dma_dst_r;

-- Write data select.  For a fill the byte captured earlier is used.  When
-- copying source to destination, the VRAM input is tied directly back out
-- to the output to allow the fastest two-cycle read/write.
dma_data_s <= dma_data_r when dma_copy_r /= '0' else vdin;

-- VRAM write strobe, active for the whole destination state.
dma_we_s <= '1' when dma_r = DMA_DST else '0';
-- DMA FSM
-- DMA is limited to 16K VRAM to avoid mem_din mux.
--
-- Combinational next-state logic.  The *_x outputs are only registered
-- while dma_pause_ack_s = '0' (see the clocked process that follows), so
-- asserting dma_pause_ack_s in a state also freezes the DMA registers for
-- that cycle.
--
--   DMA_IDLE : offer the programmed values to the working counters; leave
--              on dma_trig_r.
--   DMA_WAIT : wait for pause_ack_reg before taking over the MAR.
--   DMA_SRC  : source address cycle; capture vdin and advance the source.
--   DMA_DST  : destination write cycle; advance the destination and the
--              width / height counters.
process (vdin, dma_data_r, pause_ack_reg,
dma_r, dma_trig_r, dma_pause_s, dma_copy_r, dma_src_r, dma_dst_r,
dma_src_msb_r, dma_src_lsb_r, dma_dst_msb_r, dma_dst_lsb_r,
dma_w_r, dma_h_r, dma_w_cnt_r, dma_w_rst_r, dma_h_cnt_r,
dma_diff_r, dma_diff_s, dma_src_s, dma_dst_s)
begin
-- Defaults: hold every register, report the DMA as active and owning
-- the MAR, and do not acknowledge a pause.
dma_x <= dma_r;
dma_src_x <= dma_src_r;
dma_dst_x <= dma_dst_r;
dma_w_cnt_x <= dma_w_cnt_r;
dma_w_rst_x <= dma_w_rst_r;
dma_h_cnt_x <= dma_h_cnt_r;
dma_diff_x <= dma_diff_r;
dma_data_x <= dma_data_r;
dma_active_s <= '1';
dma_mar_s <= '1';
dma_pause_ack_s <= '0';
case dma_r is
when DMA_IDLE =>
-- Load when the DMA is idle.
dma_src_x <= dma_src_msb_r & dma_src_lsb_r;
dma_dst_x <= dma_dst_msb_r & dma_dst_lsb_r;
dma_w_cnt_x <= dma_w_r;
dma_w_rst_x <= dma_w_r;
dma_h_cnt_x <= dma_h_r;
dma_diff_x <= dma_diff_s;
dma_mar_s <= '0'; -- '0' until the GPU acknowledges it has paused
if dma_trig_r = '1' then
dma_x <= DMA_WAIT;
else
-- Not triggered: report inactive and acknowledge any pause.
dma_active_s <= '0';
dma_pause_ack_s <= '1';
end if;
when DMA_WAIT =>
if pause_ack_reg = '1' then
dma_x <= DMA_SRC;
else
dma_mar_s <= '0'; -- '0' until the GPU acknowledges it has paused
end if;
when DMA_SRC =>
-- Pausing must happen during the source state.
if dma_pause_s = '1' then
dma_pause_ack_s <= '1';
else
dma_src_x <= dma_src_s;
dma_x <= DMA_DST;
dma_data_x <= vdin; -- Save the source byte for fill only operations.
end if;
when DMA_DST =>
dma_dst_x <= dma_dst_s;
-- End of row: reload the width counter and count down one row.
if dma_w_cnt_r = 1 then
dma_w_cnt_x <= dma_w_rst_r;
dma_h_cnt_x <= dma_h_cnt_r - 1;
else
dma_w_cnt_x <= dma_w_cnt_r - 1;
end if;
-- Last byte of the last row: the transfer is complete.
if dma_h_cnt_r = 1 and dma_w_cnt_r = 1 then
dma_x <= DMA_IDLE;
else
if dma_copy_r = '0' then
-- Copy: fetch the next source byte.
dma_x <= DMA_SRC;
else
-- Fill: keep writing the saved byte, no source cycle needed.
dma_x <= DMA_DST;
-- Pausing is fine during the destination state if the source
-- is a fixed value, otherwise the pause acknowledge will be
-- delayed until the source state.
if dma_pause_s = '1' then
dma_pause_ack_s <= '1';
end if;
end if;
end if;
end case;
end process;
-- The pause seen by the DMA is the raw request OR-ed with a one-clock
-- delayed copy.  This delays the release of the pause by one extra clock
-- cycle, which allows the DMA to reassert any addressing it was doing
-- when it was paused.
dma_pause_s <= pause or dma_pause_r;

-- DMA state and working registers.
process (clk)
begin
    if rising_edge(clk) then
        -- The FSM and its counters only advance while the DMA is not
        -- acknowledging a pause.
        if dma_pause_ack_s = '0' then
            dma_data_r  <= dma_data_x;
            dma_diff_r  <= dma_diff_x;
            dma_h_cnt_r <= dma_h_cnt_x;
            dma_w_rst_r <= dma_w_rst_x;
            dma_w_cnt_r <= dma_w_cnt_x;
            dma_dst_r   <= dma_dst_x;
            dma_src_r   <= dma_src_x;
            dma_r       <= dma_x;
        end if;
        -- Pause is always transferred.
        dma_pause_r <= pause;
    end if;
end process;
-- GPU RAM
-- Single-port memory with a synchronous read: gdout is registered on every
-- rising clock edge from the location addressed by gaddr, and that location
-- is written with gdin when gwe_reg = '1'.  The read is coded before the
-- write, so on a write cycle gdout gets the previous contents.
-- NOTE(review): the statement form looks like a RAM inference template;
-- keep the read/write order as-is unless the synthesis result is re-checked.
process (clk) begin if rising_edge(clk) then
gdout <= gpuram(to_integer(unsigned(gaddr)));
if gwe_reg = '1' then
gpuram(to_integer(unsigned(gaddr))) <= gdin;
end if;
end if;
end process;
-- Workspace Register File as distributed RAM
-- Synchronous write port: ws_din is stored at ws_addr when ws_we = '1'.
process (clk) begin
if rising_edge(clk) then
if ws_we = '1' then
regfile(to_integer(unsigned(ws_addr))) <= ws_din;
end if;
end if;
end process;
-- Infer distributed RAM by reading asynchronously.
-- The read shares ws_addr with the write port, so ws_dout always shows the
-- current contents of the addressed register without a clock delay.
ws_dout <= regfile(to_integer(unsigned(ws_addr)));
-- Adjusted copy of the addressed workspace register: +1 for byte
-- operations, otherwise +2 or -2 as selected by ws_inc_flag.
ws_dout_inc <= ws_dout + 1 when byte = '1' else
               ws_dout + 2 when ws_inc_flag = '1' else
               ws_dout - 2;

-- The register file is always written from T1.
ws_din <= t1;

-- Workspace register file data input mux.  Byte operations replace only
-- the upper byte (bits 0..7) and keep the saved lower byte of the
-- destination; word operations take the full ALU result.
ws_din_mux <= (alu_reg(0 to 7) & ws_dst_save(8 to 15)) when byte /= '0' else
              alu_reg;
-- MAR (Memory Address Register) MUX
-- The DMA address takes priority whenever the DMA owns the MAR; otherwise
-- the control logic selects between the program counter and T1.
process (mar_ctrl, dma_mar_s, dma_addr_s, pc, t1)
begin
    if dma_mar_s = '1' then
        mar <= dma_addr_s;
    else
        case mar_ctrl is
            when ctrl_mar_pc =>
                mar <= pc;
            when ctrl_mar_t1 =>
                mar <= t1;
        end case;
    end if;
end process;
-- Address building
-- The top address nibble selects the target; 'x' bits are not decoded.
-- VRAM 14-bit, 16K @ >0000 to >3FFF (0011 1111 1111 1111)
-- GRAM 11-bit, 2K @ >4000 to >47FF (0100 x111 1111 1111)
-- PRAM 7-bit, 128 @ >5000 to >5x7F (0101 xxxx x111 1111)
-- VREG 6-bit, 64 @ >6000 to >6x3F (0110 xxxx xx11 1111)
-- current scanline @ >7000 to >7xx0 (0111 xxxx xxxx xxx0)
-- blanking @ >7001 to >7xx1 (0111 xxxx xxxx xxx1)
-- DMA @ >8000 to >8xx8 (1000 xxxx xxxx 1000) registers at 0..7, trigger (write) at 8
-- MAC @ >9000 to >9003 (1001 xxxx xxxx xx11) (MAC access is commented out in this section)
-- F18A version @ >A000 to >Axxx (1010 xxxx xxxx xxxx)
-- GPU status data @ >B000 to >Bxxx (1011 xxxx xxxx xxxx)
vaddr <= mar(2 to 15); -- Instruction addressing is only from VRAM and GRAM
gaddr <= mar(5 to 15); -- Low 11 bits of the MAR address the 2K GRAM
paddr <= ea_dst(9 to 14); -- Palette access is always the real address, word aligned
raddr <= t1(10 to 15) & mem_dout; -- Register addressing will always be T1 and never the PC; 6-bit register number followed by the data byte
-- VRAM write data: the GPU's mem_dout normally, the DMA data while the DMA owns the MAR.
vdout <= mem_dout when dma_mar_s = '0' else dma_data_s;
gdin <= mem_dout;
pdout <= alu_reg(4 to 15); -- Palette RAM is 12-bit ----rrrrggggbbbb
-- 7-bits of user defined status, exposed directly from its register.
gstatus <= gstatus_reg;

-- The status register takes bits 1..7 of the written byte when the status
-- write enable is active.
process (clk)
begin
    if rising_edge(clk) then
        if swe_reg = '1' then
            gstatus_reg <= mem_dout(1 to 7);
        end if;
    end if;
end process;
-- Write enable outputs.  The GPU's enables come from registers to prevent
-- glitches from causing sporadic writes.
-- VRAM: the DMA write strobe replaces the GPU's while the DMA owns the MAR.
vwe <= dma_we_s when dma_mar_s /= '0' else vwe_reg;
-- Palette RAM
pwe <= pwe_reg;
-- VDP Registers
rwe <= rwe_reg;
-- Memory write enable selection.  The top nibble of the stored destination
-- effective address picks at most one write enable; read-only and
-- unmapped regions leave all enables low.
process (ea_dst)
begin
    -- Nothing is written by default.
    swe_next <= '0';
    dwe_x    <= '0';
    rwe_next <= '0';
    pwe_next <= '0';
    gwe_next <= '0';
    vwe_next <= '0';
    -- mac_clr_x <= '0';

    case ea_dst(0 to 3) is
        when x"0" | x"1" | x"2" | x"3" =>
            vwe_next <= '1';            -- VRAM
        when x"4" =>
            gwe_next <= '1';            -- GRAM (local, private)
        when x"5" =>
            pwe_next <= '1';            -- Palette RAM
        when x"6" =>
            rwe_next <= '1';            -- VDP Registers
        -- x"7" is the current scan line and is read-only
        when x"8" =>
            dwe_x <= '1';               -- DMA registers
        -- when x"9" => mac_clr_x <= '1'; -- MAC clear
        -- x"A" F18A version and is read-only
        when x"B" =>
            swe_next <= '1';            -- 7-bits of user defined status
        when others =>
            null;
    end case;
end process;
-- Data In and write enable selector
-- The read data visible in the current state was addressed in the
-- previous state, so the selection is kept 1-cycle behind the current
-- address by registering the address nibbles used for the selection.
process (clk)
begin
    if rising_edge(clk) then
        blank_reg <= blank;             -- register to break long path delay
        rdin_reg  <= rdin;
        mar_low4  <= mar(12 to 15);
        mar_sel   <= mar(0 to 3);
    end if;
end process;
-- Memory data-in mux.  Selects the byte returned to the GPU from the
-- registered top address nibble (mar_sel) and, where a region has more
-- than one location, the registered low nibble (mar_low4).  Anything not
-- decoded reads as zero.
process (mar_sel, mar_low4, vdin, gdout, pdin, rdin_reg, blank_reg, scanline, --version, --mac_r,
         dma_src_msb_r, dma_src_lsb_r, dma_dst_msb_r, dma_dst_lsb_r,
         dma_w_r, dma_h_r, dma_stride_r, dma_inc_r, dma_copy_r)
begin
    mem_din <= (others => '0');

    case mar_sel is

        when x"0" | x"1" | x"2" | x"3" =>       -- VRAM
            mem_din <= vdin;

        when x"4" =>                            -- GRAM (local, private)
            mem_din <= gdout;

        when x"5" =>                            -- Palette RAM
            -- Return the MSB or LSB of the palette register depending
            -- on which byte is being addressed.  Palette access only
            -- works correctly with word instructions.
            if mar_low4(3) = '0' then
                mem_din <= "0000" & pdin(0 to 3);
            else
                mem_din <= pdin(4 to 11);
            end if;

        when x"6" =>                            -- register read! :-)
            mem_din <= rdin_reg;

        when x"7" =>
            if mar_low4(3) = '0' then
                mem_din <= scanline;            -- current scan line (y raster)
            else
                mem_din <= "0000000" & blank_reg;
            end if;

        when x"8" =>                            -- DMA
            case mar_low4 is
                when x"0" => mem_din <= dma_src_msb_r;
                when x"1" => mem_din <= dma_src_lsb_r;
                when x"2" => mem_din <= dma_dst_msb_r;
                when x"3" => mem_din <= dma_dst_lsb_r;
                when x"4" => mem_din <= dma_w_r;
                when x"5" => mem_din <= dma_h_r;
                when x"6" => mem_din <= dma_stride_r;
                when x"7" => mem_din <= ("000000" & dma_inc_r & dma_copy_r);
                when others => null;
            end case;

        -- when x"9" => -- MAC
        -- case mar_low4 is
        -- when x"0" => mem_din <= mac_r(0 to 7); -- MSB
        -- when x"1" => mem_din <= mac_r(8 to 15);
        -- when x"2" => mem_din <= mac_r(16 to 23);
        -- when x"3" => mem_din <= mac_r(24 to 31);
        -- when others => mem_din <= x"00";
        -- end case;

        when x"A" =>                            -- F18A version
            mem_din <= VMAJOR & VMINOR;         -- version;

        when others =>
            null;

    end case;
end process;
-- 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |
-- ---------------------------------------------------------------+
-- 1 arith 1 |opcode | B | Td | D | Ts | S |
-- 2 arith 0 1 |opc| B | Td | D | Ts | S |
-- 3 math 0 0 1 | --opcode- | D or C | Ts | S |
-- 4 jump 0 0 0 1 | ----opcode--- | signed displacement |
-- 5 shift 0 0 0 0 1 | --opcode- | C | W |
-- 5 stack* 0 0 0 0 1 | 1 ------opcode--- | Ts/Td | S/D |
-- 6 pgm 0 0 0 0 0 1 | ----opcode--- | Ts | S |
-- 7 ctrl 0 0 0 0 0 0 1 | ----opcode--- | not used |
-- 7 ctrl 0 0 0 0 0 0 1 | opcode & immd | X | W |
--
-- The stack format is new for added opcodes. The original four shift
-- opcodes have a '0' in bit-5, but have 3-bits for the instruction
-- selection. So, using bit-5 as a '1' allows detection of the new
-- instructions and modifies the remaining bits to specify the src or
-- dst of the operation, since the stack always works with R15.
-- The Win994a simulator extensions for memory paging and stack. Most
-- go overboard, like all the different PUSH and POP instructions, but
-- where possible (and makes sense) the same opcodes are used.
--
-- 0780
-- 0DC0