-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSIMT.hs
2253 lines (2003 loc) · 88.4 KB
/
SIMT.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
module Pebbles.Pipeline.SIMT
( -- Pipeline configuration
SIMTPipelineConfig(..)
-- Pipeline inputs and outputs
, SIMTPipelineIns(..)
, SIMTPipelineOuts(..)
-- Instruction info for multi-cycle instructions
, SIMTPipelineInstrInfo(..)
-- Pipeline module
, makeSIMTPipeline
) where
-- Simple 32-bit SIMT pipeline with a configurable number of warps and
-- warp size.
--
-- There are 7 pipeline stages:
--
-- 0. Warp Scheduling
-- 1. Active Thread Selection (consists of 2 sub-stages)
-- 2. Instruction Fetch
-- 3. Operand Fetch
-- 4. Operand Latch (one or more sub-stages)
-- 5. Execute (& Thread Suspension)
-- 6. Writeback (& Thread Resumption)
--
-- Instructions are suspended in the Execute stage if they cannot
-- complete within a clock cycle. Suspended instructions are resumed
-- in the Writeback stage. Resumptions can be out-of-order with
-- respect to suspensions, and there is no requirement for threads in
-- a warp to resume at the same time. This flexibility is achieved by
-- maintaining a "suspension bit" register for every thread.
-- Currently, if a warp is scheduled that contains any suspended
-- threads, then a bubble passes through the pipeline and the warp
-- will be tried again later. In future, we might attempt to avoid
-- scheduling a warp that contains a suspended thread.
--
-- Other considerations for future: (1) give retried instructions a
-- higher priority than new instructions in the warp scheduler; (2)
-- only pay cost of active thread selection on a branch/jump.
-- SoC configuration
#include <Config.h>
-- Blarney imports
import Blarney
import Blarney.Queue
import Blarney.Option
import Blarney.Stream
import Blarney.BitScan
import Blarney.PulseWire
import Blarney.SourceSink
import Blarney.TaggedUnion
import Blarney.QuadPortRAM
import Blarney.TypeFamilies
import Blarney.Interconnect
import Blarney.Vector qualified as V
import Blarney.Vector (Vec, fromList, toList)
-- General imports
import Data.List
import Data.Proxy
import Data.Maybe
import qualified Data.Map as Map
import Control.Applicative hiding (some)
-- Pebbles imports
import Pebbles.Util.List
import Pebbles.Util.Counter
import Pebbles.Pipeline.Interface
import Pebbles.Pipeline.SIMT.Management
import Pebbles.Pipeline.SIMT.RegFile
import Pebbles.Memory.Interface
import Pebbles.Memory.DRAM.Interface
import Pebbles.Memory.CoalescingUnit
import Pebbles.CSRs.TrapCodes
import Pebbles.CSRs.Custom.SIMTDevice
-- CHERI imports
import CHERI.CapLib
-- | Info about multi-cycle instructions issued by pipeline.
--
-- The pipeline exposes a value of this type for the instruction
-- currently in the execute stage ('simtInstrInfo' for the vector
-- pipeline, 'simtScalarInstrInfo' for the scalar pipeline), and each
-- resume request stream ('simtResumeReqs', 'simtScalarResumeReqs')
-- carries one alongside the resume data.
data SIMTPipelineInstrInfo =
SIMTPipelineInstrInfo {
destReg :: RegId
-- ^ Destination register
, warpId :: Bit SIMTLogWarps
-- ^ Warp that issued the instruction
, regFileId :: RegFileId
-- ^ Destination register file
}
deriving (Generic, Interface, Bits)
-- | SIMT pipeline configuration.
--
-- The type parameter @tag@ is the type of instruction mnemonics used
-- in the decode tables.
data SIMTPipelineConfig tag =
SIMTPipelineConfig {
instrMemInitFile :: Maybe String
-- ^ Instruction memory initialisation file
, instrMemLogNumInstrs :: Int
-- ^ Log2 of the instruction memory size (in number of instructions).
-- 'makeSIMTPipeline' lifts this value to the type level and uses it
-- as the address width of the instruction memory.
, instrMemBase :: Integer
-- ^ Base address of instruction memory in memory map
, enableStatCounters :: Bool
-- ^ Are stat counters enabled?
, checkPCCFunc :: Maybe (Cap -> [(Bit 1, TrapCode)])
-- ^ When CHERI is enabled, function to check PCC.
-- 'makeSIMTPipeline' treats CHERI as enabled exactly when this field
-- is a 'Just'. The function returns a table of (condition, trap code)
-- pairs; an exception is raised if any condition holds.
, useSharedPCC :: Bool
-- ^ When CHERI enabled, use shared PCC (meta-data) per kernel.
-- In 'makeSIMTPipeline' this selects a single PCC RAM indexed by
-- warp id instead of one PCC meta-data RAM per lane.
, decodeStage :: [(String, tag)]
-- ^ Decode table
, executeStage :: [State -> Module ExecuteStage]
-- ^ List of execute stages, one per lane
-- The size of this list must match the warp size
, simtPushTag :: tag
-- ^ Mnemonic for a SIMT explicit convergence instruction (push)
, simtPopTag :: tag
-- ^ Mnemonic for a SIMT explicit convergence instruction (pop)
, useRegFileScalarisation :: Bool
-- ^ Use scalarising register file?
, useAffineScalarisation :: Bool
-- ^ Use affine scalarisation, or plain uniform scalarisation?
, useCapRegFileScalarisation :: Bool
-- ^ Use scalarising register file for capabilities?
, useScalarUnit :: Bool
-- ^ Use dedicated scalar unit for parallel scalar/vector execution?
, scalarUnitAllowList :: [tag]
-- ^ A list of instructions that can execute on the scalar unit
, scalarUnitAffineAdd :: Maybe tag
-- ^ Optionally replace add instr with built-in affine add instr
, scalarUnitAffineCMove :: Maybe tag
-- ^ Optionally replace cmove instr with built-in affine version
, scalarUnitAffineCIncOffset :: Maybe tag
-- ^ Optionally replace cincoffset instr with built-in affine version
, scalarUnitDecodeStage :: [(String, tag)]
-- ^ Decode table for scalar unit
, scalarUnitExecuteStage :: State -> Module ExecuteStage
-- ^ Execute stage for scalar unit
, regSpillBaseAddr :: Integer
-- ^ Base address of register spill region in DRAM
, useLRUSpill :: Bool
-- ^ Prefer to spill registers that are not recently used
, useRRSpill :: Bool
-- ^ Round robin spill strategy
, useSharedVectorScratchpad :: Bool
-- ^ Share vector scratchpad between int and cap reg files?
, usesCapA :: [tag]
-- ^ Instructions that use cap meta-data of register operand A
, usesCapB :: MnemonicVec -> Bit 32 -> Bit 1
-- ^ Function to determine if an instruction uses cap meta-data of
-- operand B, given the packed mnemonic vector and the 32-bit
-- instruction word
, shareCapSRFPort :: Bool
-- ^ Forwarded as the @sharePortB@ option of the scalarising
-- capability register file; when set, capability operand B is only
-- loaded for instructions selected by 'usesCapB'.
-- NOTE(review): presumably shares a read port of the capability
-- scalar register file -- confirm against the register file module.
}
-- | SIMT pipeline inputs, as consumed by 'makeSIMTPipeline'
data SIMTPipelineIns =
SIMTPipelineIns {
simtMgmtReqs :: Stream SIMTReq
-- ^ Stream of pipeline management requests
, simtWarpCmdWire :: Wire WarpCmd
-- ^ When this wire is active, the warp currently in the execute
-- stage (assumed to be converged) is issuing a warp command
, simtResumeReqs :: Stream (SIMTPipelineInstrInfo,
Vec SIMTLanes (Option ResumeReq))
-- ^ Resume requests for multi-cycle instructions (vector pipeline);
-- each element carries one optional resume request per lane
, simtScalarResumeReqs :: Stream (SIMTPipelineInstrInfo, ResumeReq)
-- ^ Resume requests for multi-cycle instructions (scalar pipeline)
, simtDRAMStatSigs :: DRAMStatSigs
-- ^ For DRAM stat counters (load and store signals are summed into
-- the DRAM access count)
, simtMemReqs :: Vec SIMTLanes (Sink MemReq)
-- ^ Memory request path (one sink per lane)
, simtCoalStats :: CoalUnitPerfStats
-- ^ For coalescing unit stat counters (store buffer load hits and
-- misses, counted separately for capability meta-data accesses)
}
-- | SIMT pipeline outputs, as returned by 'makeSIMTPipeline'
data SIMTPipelineOuts =
SIMTPipelineOuts {
simtMgmtResps :: Stream SIMTResp
-- ^ Stream of pipeline management responses
, simtCurrentWarpId :: Bit 32
-- ^ Warp id of instruction currently in execute stage
, simtKernelAddr :: Bit 32
-- ^ Address of kernel closure as set by CPU
, simtInstrInfo :: SIMTPipelineInstrInfo
-- ^ Info for instruction currently in execute stage (vector pipeline)
, simtScalarInstrInfo :: SIMTPipelineInstrInfo
-- ^ Info for instruction currently in execute stage (scalar pipeline)
, simtScalarisedOpB :: ScalarisedOperand
-- ^ Scalarised operand B of instruction currently in vector
-- pipeline's execute stage
, simtOpcode :: MnemonicVec
-- ^ Opcode for instruction currently in execute stage (vector pipeline)
}
-- | Per-thread state.
--
-- 'makeSIMTPipeline' keeps one block RAM of these per lane, indexed
-- by warp id.
data SIMTThreadState =
SIMTThreadState {
simtPC :: Bit 32
-- ^ Program counter
, simtNestLevel :: Bit SIMTLogMaxNestLevel
-- ^ SIMT divergence nesting level; the active threads of a warp are
-- those with the maximum nesting level
, simtRetry :: Bit 1
-- ^ The last thing this thread did was a retry; used to break ties
-- between equal nesting levels during active thread selection
}
deriving (Generic, Bits, Interface)
-- | SIMT pipeline module
makeSIMTPipeline :: Tag tag =>
SIMTPipelineConfig tag
-- ^ SIMT configuration options
-> SIMTPipelineIns
-- ^ SIMT pipeline inputs
-> Module SIMTPipelineOuts
-- ^ SIMT pipeline outputs
makeSIMTPipeline c inputs =
-- Lift some parameters to the type level
liftNat (c.instrMemLogNumInstrs) \(_ :: Proxy t_logInstrs) -> do
-- Sanity check
staticAssert (SIMTLanes == genericLength c.executeStage)
"makeSIMTPipeline: warp size does not match number of execute units"
-- Is CHERI enabled?
let enableCHERI = isJust c.checkPCCFunc
-- Is dynamic register file spilling logic enabled?
-- See Note [Dynamic register spilling]
let enableRegSpill = SIMTRegFileSize < SIMTWarps*32
let enableCapSpill = enableCHERI && SIMTCapRegFileSize < SIMTWarps*32
let enableSpill = enableRegSpill || enableCapSpill
-- Compute field selector functions from decode table
let selMap = matchSel (c.decodeStage)
-- Functions for extracting register ids from an instruction
let srcA :: Instr -> RegId = getBitFieldSel selMap "rs1"
let srcB :: Instr -> RegId = getBitFieldSel selMap "rs2"
let dst :: Instr -> RegId = getBitFieldSel selMap "rd"
-- Queue of active warps for vector pipeline
warpQueue :: [Reg (Bit 1)] <- replicateM SIMTWarps (makeReg false)
-- Track number of warps in scalar pipeline
-- (Max val set to half num warps for load balancing)
scalarUnitWarpCount :: Counter SIMTLogWarps <-
makeCounter (fromInteger (SIMTWarps `div` 2))
-- One block RAM of thread states per lane
(stateMemsA, stateMemsB) :: ([RAM (Bit SIMTLogWarps) SIMTThreadState],
[RAM (Bit SIMTLogWarps) SIMTThreadState]) <-
unzip <$> replicateM SIMTLanes makeQuadRAM
-- One program counter capability RAM (meta-data only) per lane
(pccMemsA, pccMemsB) :: ([RAM (Bit SIMTLogWarps) CapMemMeta],
[RAM (Bit SIMTLogWarps) CapMemMeta]) <-
unzip <$> (replicateM SIMTLanes $
if enableCHERI && not c.useSharedPCC
then makeQuadRAM
else return (nullRAM, nullRAM))
-- Instruction memory
(instrMemA, instrMemB) ::
(RAM (Bit t_logInstrs) Instr, RAM (Bit t_logInstrs) Instr) <-
makeQuadRAMCore c.instrMemInitFile
-- Suspension bit for each thread
suspBits :: [[Reg (Bit 1)]] <-
replicateM SIMTLanes (replicateM SIMTWarps (makeReg false))
-- Register file load latency (for vector pipeline)
let loadLatency =
if c.useCapRegFileScalarisation || c.useRegFileScalarisation
then simtScalarisingRegFile_loadLatency
else if enableCHERI then 2 else 1
-- Is the pipeline active?
pipelineActive :: Reg (Bit 1) <- makeReg false
-- Register file
regFile :: SIMTRegFile (Log2Ceil SIMTRegFileSize) 33 <-
if c.useRegFileScalarisation
then makeSIMTScalarisingRegFile
SIMTScalarisingRegFileConfig {
useAffine = c.useAffineScalarisation
, useScalarUnit = c.useScalarUnit
, regInitVal = 0
, size = SIMTRegFileSize
, useDynRegSpill =
SIMTRegFileSize < SIMTWarps * 32
, useSharedVecSpad = Nothing
, pipelineActive = pipelineActive.val
, useInitValOpt = False
, sharePortB = False
}
else makeSIMTRegFile
SIMTRegFileConfig {
loadLatency = loadLatency
, regInitVal = Nothing
}
-- Capability register file (meta-data only)
capRegFile :: SIMTRegFile (Log2Ceil SIMTCapRegFileSize) CapMemMetaWidth <-
if enableCHERI
then
if c.useCapRegFileScalarisation
then makeSIMTScalarisingRegFile
SIMTScalarisingRegFileConfig {
useAffine = False
, useScalarUnit = c.useScalarUnit
, regInitVal = nullCapMemMetaVal
, size = SIMTCapRegFileSize
, useDynRegSpill =
SIMTCapRegFileSize < SIMTWarps * 32
, useSharedVecSpad =
#if SIMTUseSharedVecScratchpad
Just regFile.sharedVecSpad
#else
Nothing
#endif
, pipelineActive = pipelineActive.val
, useInitValOpt = SIMTCapRFUseInitValOpt == 1
, sharePortB = c.shareCapSRFPort
}
else makeSIMTRegFile
SIMTRegFileConfig {
loadLatency = loadLatency
, regInitVal = Just nullCapMemMetaVal
}
else makeNullSIMTRegFile
-- Scalar prediction table: for each instruction in the
-- instruction memory, was the instruction scalarisable the
-- last time it was executed?
(scalarTableA, scalarTableB) ::
(RAM (Bit t_logInstrs) (Bit 1),
RAM (Bit t_logInstrs) (Bit 1)) <-
if c.useScalarUnit then makeQuadRAM else return (nullRAM, nullRAM)
-- Barrier bit for each warp
barrierBits :: [Reg (Bit 1)] <- replicateM SIMTWarps (makeReg 0)
-- Count of number of warps in a barrier
-- (Not read when all warps in barrier, so overflow not a problem)
barrierCount :: Counter SIMTLogWarps <- makeCounter dontCare
-- Trigger for each stage
go0 :: Reg (Bit 1) <- makeDReg false
go1 :: Reg (Bit 1) <- makeDReg false
go4 :: Reg (Bit 1) <- makeDReg false
-- Warp id register, for each stage
warpId1 :: Reg (Bit SIMTLogWarps) <- makeReg dontCare
warpId4 :: Reg (Bit SIMTLogWarps) <- makeReg dontCare
-- Thread state, for each stage
state4 :: Reg SIMTThreadState <- makeReg dontCare
-- Active thread mask
activeMask2b :: Reg (Bit SIMTLanes) <- makeReg dontCare
activeMask4 :: Reg (Bit SIMTLanes) <- makeReg dontCare
-- Instruction register for each stage
instr4 :: Reg (Bit 32) <- makeReg dontCare
-- Is any thread in the current warp suspended?
isSusp4 :: Reg (Bit 1) <- makeReg dontCare
-- Insert warp back into warp queue at end of pipeline?
rescheduleWarp6 :: Reg (Bit 1) <- makeDReg false
-- Global exception register for entire core
excGlobal :: Reg (Bit 1) <- makeReg false
-- Program counter at point of exception
excPC :: Reg (Bit 32) <- makeReg dontCare
-- Per-lane exception register
excLocals :: [Reg (Bit 1)] <- replicateM SIMTLanes (makeReg false)
-- Kernel response queue (indicates to CPU when kernel has finished)
kernelRespQueue :: Queue SIMTResp <- makeShiftQueue 1
-- Track how many warps have terminated
completedWarps :: Reg (Bit SIMTLogWarps) <- makeReg 0
-- Track kernel success/failure
kernelSuccess :: Reg (Bit 1) <- makeReg true
-- Per-warp program counter capability registers, if shared PCC enabled
(pccSharedA, pccSharedB) :: (RAM (Bit SIMTLogWarps) CapPipe,
RAM (Bit SIMTLogWarps) CapPipe) <-
if enableCHERI && c.useSharedPCC
then makeQuadRAM
else return (nullRAM, nullRAM)
-- Basic stat counters
cycleCount :: Reg (Bit 32) <- makeReg 0
instrCount :: Reg (Bit 32) <- makeReg 0
-- Stat counter for scalarisable instructions (if scalar unit
-- disabled) or scalarised instructions (if scalar unit enabled)
scalarisableInstrCount :: Reg (Bit 32) <- makeReg 0
-- Count pipeline bubbles for performance stats
retryCount :: Reg (Bit 32) <- makeReg 0
suspCount :: Reg (Bit 32)<- makeReg 0
scalarSuspCount :: Reg (Bit 32) <- makeReg 0
scalarAbortCount :: Reg (Bit 32) <- makeReg 0
-- Count DRAM accesses for performance stats
dramAccessCount :: Reg (Bit 32) <- makeReg 0
-- Count coalescing unit store buffer load hit/miss
sbLoadHitCount :: Reg (Bit 32) <- makeReg 0
sbLoadMissCount :: Reg (Bit 32) <- makeReg 0
sbCapLoadHitCount :: Reg (Bit 32) <- makeReg 0
sbCapLoadMissCount :: Reg (Bit 32) <- makeReg 0
-- Triggers from each execute unit to increment instruction count
incInstrCountRegs <- replicateM SIMTLanes (makeDReg false)
incScalarInstrCount <- makeDReg false
incRetryCount <- makeDReg false
incSuspCount <- makeDReg false
incScalarSuspCount <- makeDReg false
incScalarAbortCount <- makeDReg false
-- Indicates that current instruction is scalarisable
instrScalarisable5 <- makeReg false
-- Scalar unit warp queue
scalarQueue :: [Reg (Bit 1)] <- replicateM SIMTWarps (makeReg false)
-- Function to convert from 32-bit PC to instruction address
let toInstrAddr :: Bit 32 -> Bit t_logInstrs =
\pc -> truncateCast (slice @31 @2 pc)
-- For each pipeline stage, is it spilling a register for dynamic spilling?
spill0 :: Reg (Bit 1) <- makeDReg false
spill1 :: Reg (Bit 1) <- makeDReg false
spill4 :: Reg (Bit 1) <- makeDReg false
-- For each pipeline stage, is it spilling from int or cap reg file
spillFrom0 :: Reg (Bit 1) <- makeReg dontCare
spillFrom1 :: Reg (Bit 1) <- makeReg dontCare
spillFrom4 :: Reg (Bit 1) <- makeReg dontCare
-- Register to spill
spillReg4 :: Reg (Bit 5) <- makeReg dontCare
-- Spill successful?
spillSuccess6 :: Reg (Bit 1) <- makeDReg false
-- Vector register mask (for dynamic spilling)
vecMask2b :: Reg (Bit 32) <- makeReg dontCare
-- Maintain pseudo rolling average for register use
regUsage :: Vec 32 (Reg (Bit SIMTRegCountBits)) <- V.replicateM (makeReg 0)
-- Mask of registers to prioritise for spilling
spillMaskPref :: Reg (Bit 32) <- makeReg dontCare
-- Previous register spilled
spillMaskPrev :: Reg (Bit 32) <- makeReg 0
-- Tags of affine-scalarisable instructions
let affineTags = catMaybes [ c.scalarUnitAffineAdd
, c.scalarUnitAffineCMove
, c.scalarUnitAffineCIncOffset ]
-- Pipeline Initialisation
-- =======================
-- Register to trigger pipeline initialisation at some PC
startReg :: Reg (Option (Bit 32)) <- makeReg none
-- Warp id counter, to initialise PC of each thread
warpIdCounter :: Reg (Bit SIMTLogWarps) <- makeReg 0
-- Has initialisation completed?
initComplete :: Reg (Bit 1) <- makeReg false
always do
let start = startReg.val
-- When start register is valid, perform initialisation
when (start.valid .&. kernelRespQueue.notFull) do
-- Write PC to each thread of warp
let initState =
SIMTThreadState {
simtPC = start.val
, simtNestLevel = 0
, simtRetry = false
}
-- Initialise per-warp state
let initPCC = almightyCapMemVal -- TODO: constrain, or take as param
sequence_
[ do stateMem.store (warpIdCounter.val) initState
if enableCHERI
then pccMem.store (warpIdCounter.val) (upper initPCC)
else return ()
| (stateMem, pccMem) <- zip stateMemsA pccMemsA ]
-- Initialise PCC per warp
-- TODO: constrain, or take as param
pccSharedA.store warpIdCounter.val almightyCapPipeVal
when (warpIdCounter.val .==. 0) do
-- Register file initialisation
regFile.init
capRegFile.init
-- Reset various state
excGlobal <== false
sequence_ [e <== false | e <- excLocals]
setCount barrierCount 0
sequence_ [r <== false | r <- barrierBits]
sequence_ [r <== true | r <- warpQueue]
-- Finish initialisation and activate pipeline
if warpIdCounter.val .==. ones
then do
startReg <== none
initComplete <== true
warpIdCounter <== 0
else
warpIdCounter <== warpIdCounter.val + 1
always do
let initDone = andList
[ initComplete.val
, inv regFile.initInProgress
, inv capRegFile.initInProgress ]
when initDone do
initComplete <== false
-- Start pipeline
pipelineActive <== true
-- Reset counters
cycleCount <== 0
instrCount <== 0
scalarisableInstrCount <== 0
retryCount <== 0
suspCount <== 0
scalarSuspCount <== 0
scalarAbortCount <== 0
dramAccessCount <== 0
sbLoadHitCount <== 0
sbLoadMissCount <== 0
sbCapLoadHitCount <== 0
sbCapLoadMissCount <== 0
-- Stat counters
-- =============
when c.enableStatCounters do
always do
when (pipelineActive.val) do
-- Increment cycle count
cycleCount <== cycleCount.val + 1
-- Increment instruction count
let instrIncs :: [Bit 32] =
map zeroExtend (map (.val) incInstrCountRegs)
let instrInc = tree1 (\a b -> reg 0 (a+b)) instrIncs
let scalarInstrInc =
incScalarInstrCount.val ? (SIMTLanes, 0)
if c.useScalarUnit
then do
instrCount <== instrCount.val + instrInc + scalarInstrInc
scalarisableInstrCount <==
scalarisableInstrCount.val + scalarInstrInc
else do
instrCount <== instrCount.val + instrInc
scalarisableInstrCount <== scalarisableInstrCount.val +
(if delay false instrScalarisable5.val then instrInc else 0)
-- Pipeline bubbles
when incRetryCount.val do retryCount <== retryCount.val + 1
when incSuspCount.val do suspCount <== suspCount.val + 1
when c.useScalarUnit do
when incScalarSuspCount.val do
scalarSuspCount <== scalarSuspCount.val + 1
when incScalarAbortCount.val do
scalarAbortCount <== scalarAbortCount.val + 1
-- DRAM accesses
dramAccessCount <==
dramAccessCount.val +
zeroExtend inputs.simtDRAMStatSigs.dramLoadSig +
zeroExtend inputs.simtDRAMStatSigs.dramStoreSig
-- Store buffer hit rate
when inputs.simtCoalStats.incLoadHit do
if inputs.simtCoalStats.isCapMetaAccess
then sbCapLoadHitCount <== sbCapLoadHitCount.val + 1
else sbLoadHitCount <== sbLoadHitCount.val + 1
when inputs.simtCoalStats.incLoadMiss do
if inputs.simtCoalStats.isCapMetaAccess
then sbCapLoadMissCount <== sbCapLoadMissCount.val + 1
else sbLoadMissCount <== sbLoadMissCount.val + 1
-- ===============
-- Vector Pipeline
-- ===============
-- Stage 0: Warp Scheduling
-- ========================
-- Scheduler history
schedHistory :: Reg (Bit SIMTWarps) <- makeReg 0
-- Which warps contain at least one suspended thread?
warpSuspMask :: [Reg (Bit 1)] <- replicateM SIMTWarps (makeReg false)
-- Warp chosen by scheduler
chosenWarp :: Reg (Bit SIMTWarps) <- makeReg dontCare
-- For dynamic register spilling
--------------------------------
-- Register spill mode
regSpillMode <- makeReg false
-- Spill from int reg file (otherwise spill from cap reg file)
regSpillModeIntOrCap <- makeReg 0
-- Warps currently in vector pipeline spilling a register
spillingWarps :: [Reg (Bit 1)] <- replicateM SIMTWarps (makeReg false)
-- Scheduler history for warps that are spilling a register
schedHistorySpill :: Reg (Bit SIMTWarps) <- makeReg 0
-- Warp chosen by scheduler to spill a register
chosenWarpSpill :: Reg (Bit SIMTWarps) <- makeReg dontCare
--------------------------------
-- Continuous monitoring/buffering of signals needed by scheduler
always do
-- Calculate which warps contain a suspended thread
-- (delayed by one cycle)
sequence_
[ b <== orList (map (.val) bs)
| (b, bs) <- zip warpSuspMask (transpose suspBits) ]
-- Enable register spill mode when required
when enableSpill do
if enableCHERI && c.useSharedVectorScratchpad
then do
let totalRegs = regFile.numVecRegs + capRegFile.numVecRegs
let needSpill = SIMTRegFileSize - totalRegs .<. SIMTWarps
let doCapSpill = if enableCapSpill
then capRegFile.numVecRegs .>=.
SIMTSharedVecSpadCapThreshold
else false
regSpillMode <== if enableRegSpill || enableCapSpill
then needSpill else false
regSpillModeIntOrCap <== if doCapSpill then 1 else 0
else do
let needSpill :: forall s n. SIMTRegFile s n -> Bit 1
needSpill rf = rf.numVecRegsUnused .<. SIMTWarps
let needRegSpill =
if enableRegSpill then needSpill regFile else false
let needCapSpill =
if enableCapSpill then needSpill capRegFile else false
regSpillMode <== needRegSpill .||. needCapSpill
regSpillModeIntOrCap <== if needRegSpill then 0 else 1
-- Scheduler: 1st substage
always do
-- Bit mask of available warps
let avail :: Bit SIMTWarps = fromBitList
[ w.val .&&. inv s.val .&&.
(if enableSpill then inv e.val else true)
| (w, s, e) <- zip3 warpQueue warpSuspMask spillingWarps ]
-- Fair scheduler
let (newSchedHistory, chosen) = fairScheduler (schedHistory.val, avail)
chosenWarp <== chosen
-- Dynamic register spilling: bit mask of available warps
let spillAvail :: Bit SIMTWarps = fromBitList
[ orList [w.val, b.val] .&&. inv e.val .&&. inv susp.val
| (w, b, e, susp) <-
zip4 warpQueue barrierBits
spillingWarps warpSuspMask ]
-- Dynamic register spilling: fair scheduler
let (newSchedHistorySpill, chosenSpill) =
fairScheduler (schedHistorySpill.val, spillAvail)
when enableSpill do chosenWarpSpill <== chosenSpill
-- Trigger stage 1
when pipelineActive.val do
-- Do we need to spill registers for dynamic register spilling?
let spillMode = if enableSpill then regSpillMode.val else false
-- Select warp for register spill, or normal operation?
if spillMode
then do
when enableSpill do
sequence_
[ when c do r <== true
| (r, c) <- zip spillingWarps (toBitList chosenSpill) ]
when (spillAvail .!=. 0) do
schedHistorySpill <== newSchedHistorySpill
spill0 <== true
spillFrom0 <== regSpillModeIntOrCap.val
go0 <== true
else do
sequence_
[ when c do r <== false
| (r, c) <- zip warpQueue (toBitList chosen) ]
when (avail .!=. 0) do
schedHistory <== newSchedHistory
go0 <== true
-- Scheduler: 2nd substage
always do
when go0.val do
let warp =
if enableSpill
then spill0.val ? (chosenWarpSpill.val, chosenWarp.val)
else chosenWarp.val
let warpId = binaryEncode warp
-- Load state for next warp on each lane
forM_ stateMemsA \stateMem -> do
stateMem.load warpId
-- Load PCC for next warp on each lane
if enableCHERI && not c.useSharedPCC
then do
forM_ pccMemsA \pccMem -> do
pccMem.load warpId
else return ()
-- Buffer warp id for stage 1
warpId1 <== warpId
-- Trigger stage 1
go1 <== true
spill1 <== spill0.val
spillFrom1 <== spillFrom0.val
-- Stage 1: Active Thread Selection
-- ================================
-- For timing, we split this stage over several cycles
let stage1Substages = 2
always do
regFile.loadVecMask warpId1.val
capRegFile.loadVecMask warpId1.val
-- Active threads are those with the max nesting level
-- On a tie, favour instructions undergoing a retry
let maxOf a@(a_nest, a_retry, _)
b@(b_nest, b_retry, _) =
if (a_nest # a_retry) .>. (b_nest # b_retry) then a else b
let (_, _, leaderIdx) =
pipelinedTree1 (stage1Substages-1) maxOf
[ ( mem.out.simtNestLevel
, mem.out.simtRetry
, fromInteger i :: Bit SIMTLogLanes )
| (mem, i) <- zip stateMemsA [0..] ]
-- Wait for leader index to be computed
let states2_tmp =
[iterateN (stage1Substages-1) buffer (mem.out) | mem <- stateMemsA]
let pccs2_tmp =
[iterateN (stage1Substages-1) buffer (mem.out) | mem <- pccMemsA]
-- State and PCC of leader
let state2 = buffer (states2_tmp ! leaderIdx)
let pcc2 = buffer (pccs2_tmp ! leaderIdx)
-- State and PCC of all threads
let stateMemOuts2 = map buffer states2_tmp
let pccs2 = map buffer pccs2_tmp
-- Trigger stage 2
let warpId2 = iterateN stage1Substages buffer warpId1.val
let spill2 = iterateN stage1Substages (delay 0) spill1.val
let spillFrom2 = iterateN stage1Substages (delay 0) spillFrom1.val
let vecMask2 = iterateN (stage1Substages - 1) (delay 0) regFile.getVecMask
let capVecMask2 = iterateN (stage1Substages - 1) (delay 0) capRegFile.getVecMask
let go2 = iterateN stage1Substages (delay 0) go1.val
-- Stage 2: Instruction Fetch
-- ==========================
always do
-- Compute active thread mask
let activeList =
[ state2 === s .&&.
if enableCHERI && not (c.useSharedPCC)
then pcc2 .==. pcc
else true
| (s, pcc) <- zip stateMemOuts2 pccs2]
let activeMask :: Bit SIMTLanes = fromBitList activeList
if enableSpill
then activeMask2b <== spill2 ? (ones, activeMask)
else activeMask2b <== activeMask
-- Assert that at least one thread in the warp must be active
when go2 do
dynamicAssert (orList activeList)
"SIMT pipeline error: no active threads in warp"
-- Issue load to instruction memory
let pc = state2.simtPC
instrMemA.load (toInstrAddr pc)
-- Load per-warp PCC
when c.useSharedPCC do
pccSharedA.load warpId2
-- Get vector register mask for current warp
when enableSpill do
let chooseRF i c =
case (enableRegSpill, enableCapSpill) of
(False, False) -> dontCare
(True , False) -> i
(False, True) -> c
(True , True) -> spillFrom2 ? (c, i)
let vecMask = chooseRF vecMask2 capVecMask2
vecMask2b <== vecMask
when c.useLRUSpill do
let subset xs = [x | (x, i) <- zip xs [0..], i `mod` 2 == 1]
let usage = zipWith (\v c -> if v then c.val else ones)
(toBitList vecMask) (toList regUsage)
let min a b = if a .<. b then a else b
let regUsageMin = tree1 min (subset usage)
spillMaskPref <==
pack (V.map (\c -> c.val .<=. regUsageMin) regUsage)
when c.useRRSpill do
spillMaskPref <== reverseBits (reverseBits spillMaskPrev.val - 1)
-- Second instruction fetch stage
---------------------------------
-- Only used when dynamic reg spilling enabled
-- Used to determine which register to spill
-- Outputs from second fetch stage
fetchA3Reg :: Reg RegId <- makeReg dontCare
spillFail3Reg :: Reg (Bit 1) <- makeReg dontCare
spillA3Reg :: Reg RegId <- makeReg dontCare
always do
when enableSpill do
let srcRegA = srcA instrMemA.out
let srcRegB = srcB instrMemA.out
let dstReg = dst instrMemA.out
let spillMaskA0 = vecMask2b.val .&.
inv (binaryDecode srcRegA
.|. binaryDecode srcRegB
.|. binaryDecode dstReg)
let spillMaskA1 = spillMaskA0 .&. spillMaskPref.val
let spillMaskA =
if c.useLRUSpill || c.useRRSpill
then (spillMaskA1 .==. 0) ? (spillMaskA0, spillMaskA1)
else spillMaskA0
let firstSpillMaskA = firstHot spillMaskA
spillMaskPrev <== firstSpillMaskA
let spillA = binaryEncode firstSpillMaskA
spillA3Reg <== spillA
let spill2b = delay false spill2
fetchA3Reg <== spill2b ? (spillA, srcRegA)
spillFail3Reg <== if enableSpill
then (spill2b .&&. spillMaskA .==. 0) else false
-- State for stage 3
let stage2Delay :: forall a. Bits a => a -> a
stage2Delay x = if enableSpill then delay zero (delay zero x)
else delay zero x
let warpId3 = stage2Delay warpId2
let state3 = stage2Delay state2
let pcc3 = stage2Delay $
if c.useSharedPCC
then let cap = (setAddr pccSharedA.out state2.simtPC).value in
decodeCapPipe' (pack $ toMem cap) cap
else decodeCapMem (pcc2 # state2.simtPC)
let spill3 = stage2Delay spill2
let spillFrom3 = stage2Delay spillFrom2
let spillFail3 = if enableSpill then spillFail3Reg.val else false
let fetchA3 = if enableSpill then fetchA3Reg.val
else srcA instrMemA.out
let instr3 = if enableSpill then old instrMemA.out else instrMemA.out
let activeMask3 =
if enableSpill then old activeMask2b.val else activeMask2b.val
let (tagMap3, _) = matchMap False (c.decodeStage) instr3
let usesCapMetaDataA3 =
if enableCHERI && c.useSharedVectorScratchpad
then spill3 .||. orList
[ Map.findWithDefault false tag tagMap3
| tag <- c.usesCapA ]
else true
let go3 = stage2Delay go2
let usesCapMetaDataB3 =
if enableCHERI && (c.useSharedVectorScratchpad || c.shareCapSRFPort)
then go3 .&&. c.usesCapB (packTagMap tagMap3) instr3
else true
-- Stage 3: Operand Fetch
-- ======================
let pcc4 = delay dontCare pcc3
always do
when (go3 .&&. inv capRegFile.stall) do
-- Fetch operands from register file
regFile.loadA (warpId3, fetchA3)
regFile.loadB (warpId3, srcB instr3)
-- Fetch capability meta-data from register file
when usesCapMetaDataA3 do capRegFile.loadA (warpId3, fetchA3)
when usesCapMetaDataB3 do capRegFile.loadB (warpId3, srcB instr3)
-- Load eviction status of destination register
when enableSpill do
regFile.loadEvictedStatus (warpId3, dst instr3)
capRegFile.loadEvictedStatus (warpId3, dst instr3)
-- Is any thread in warp suspended?
-- (In future, consider only suspension bits of active threads)
let isSusp3 = orList [map (.val) regs ! warpId3 | regs <- suspBits]
isSusp4 <== isSusp3 .||. spillFail3
-- Check PCC
case c.checkPCCFunc of
-- CHERI disabled; no check required
Nothing -> return ()
-- Check PCC
Just checkPCC -> do
let table = checkPCC pcc3
let exception = orList [cond | (cond, _) <- table]
when go3 do
when exception do
head excLocals <== true
let trapCode = priorityIf table (excCapCode 0)
display "SIMT pipeline: PCC exception: code=" trapCode
-- Handle reg file stall
when enableCHERI do
when (go3 .&&. capRegFile.stall) do
if spill3
then (spillingWarps!warpId3) <== false
else (warpQueue!warpId3) <== true
-- Trigger stage 4
warpId4 <== warpId3
activeMask4 <== activeMask3
instr4 <== instr3
state4 <== state3
spill4 <== spill3 .&&. inv capRegFile.stall
spillFrom4 <== spillFrom3
spillReg4 <== spillA3Reg.val
go4 <== go3 .&&. inv capRegFile.stall
-- Stage 4: Operand Latch
-- ======================
-- Delay given signal by register file load latency
let loadDelay :: Bits a => a -> a
loadDelay inp = iterateN (loadLatency - 1) (delay zero) inp
-- Extra latch when shared vector spad in use
let extra :: Bits a => a -> a
extra inp = if enableCHERI && c.useSharedVectorScratchpad
then delay zero inp else inp
-- Decode instruction
let delayedInstr4 = loadDelay instr4.val
let (tagMap4, fieldMap4) = matchMap False (c.decodeStage) delayedInstr4
-- Stage 5 register operands
let rfAOut = V.map (slice @31 @0) regFile.outA
let rfBOut = V.map (slice @31 @0) regFile.outB
let vecRegA5 = old $ extra rfAOut
let vecRegB5 = old $ extra rfBOut
-- Stage 5 capability register operands
let getCapReg intReg capReg =
old $ decodeCapMem (extra (capReg # intReg))
let vecCapRegA5 = V.zipWith getCapReg rfAOut capRegFile.outA
let vecCapRegB5 = V.zipWith getCapReg rfBOut capRegFile.outB
let vecRawCapMetaRegA5 = V.map (old . extra) capRegFile.outA
-- Determine if field is available in current instruction
let isFieldInUse fld fldMap =
case Map.lookup fld fldMap of
Nothing -> false
Just opt -> opt.valid
let usesA = isFieldInUse "rs1" fieldMap4
let usesB = isFieldInUse "rs2" fieldMap4
let usesDest = isFieldInUse "rd" fieldMap4
let usesCapA = loadDelay (delay false usesCapMetaDataA3)
let usesCapB = loadDelay (delay false usesCapMetaDataB3)
let usesA5 = delay false $ extra usesA
let usesB5 = delay false $ extra usesB
let usesDest5 = delay false $ extra usesDest
-- Register unspilling (fetching)