Skip to content

Commit 2f5b425

Browse files
authored
Haskell/solution_2: add multi-threading (PlummersSoftwareLLC#919)
1 parent 08962a4 commit 2f5b425

File tree

4 files changed

+73
-35
lines changed

4 files changed

+73
-35
lines changed

Diff for: PrimeHaskell/solution_2/CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33
## 0.1.0.0 -- 2021-09-26
44

55
* Complete version including all five techniques and three "hybrid" compile options for 64, 128, and 256 bit registers.
6+
* Also includes Multi-threading for all of the above five techniques on four threads.

Diff for: PrimeHaskell/solution_2/Primes.hs

+44-20
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,16 @@ import PrimesNoLSR ( Technique(..), primesSoENoLSR )
77
import Data.Time.Clock.POSIX ( getPOSIXTime, POSIXTime )
88
import Data.Word ( Word8, Word64 )
99
import Data.Bits ( Bits((.|.), (.&.), shiftL, shiftR) )
10-
import Control.Concurrent ( threadDelay )
11-
import Control.Monad ( forM_, foldM_, foldM )
10+
import Data.Maybe (fromMaybe)
11+
import Control.Monad ( forM_, forM, foldM_, foldM )
1212
import Control.Monad.ST ( ST )
1313
import Data.Array ( Array )
1414
import Data.Array.Base ( MArray(newArray), STUArray(STUArray),
1515
castSTUArray, unsafeRead, unsafeWrite,
1616
UArray, listArray, assocs, unsafeAt )
1717
import Data.Array.ST ( runSTUArray )
18+
import Control.Concurrent ( threadDelay, setNumCapabilities, forkIO )
19+
import Control.Concurrent.MVar ( MVar, newEmptyMVar, putMVar, takeMVar )
1820

1921
type Prime = Word64
2022
type SieveBuffer = UArray Int Bool
@@ -28,6 +30,9 @@ cFORTIME = 5
2830
cCPUL1CACHE :: Int
2931
cCPUL1CACHE = 16384 -- in bytes, must be power of two
3032

33+
cNUMPROCS :: Int
34+
cNUMPROCS = 4
35+
3136
-- | Historical data for validating our results - the number of primes
3237
-- to be found under some limit, such as 168 primes under 1000
3338
primeCounts :: [(Prime, Int)]
@@ -42,7 +47,7 @@ primeCounts =
4247
]
4348

4449
cEXPECTED :: Int
45-
cEXPECTED = maybe 0 id $ lookup cLIMIT primeCounts
50+
cEXPECTED = fromMaybe 0 $ lookup cLIMIT primeCounts
4651

4752
cBITMASK :: UArray Int Word8 -- faster than bit shifting...
4853
cBITMASK = listArray (0, 7) [ 1, 2, 4, 8, 16, 32, 64, 128 ]
@@ -122,9 +127,8 @@ listPrimes :: SieveBuffer -> [Prime]
122127
listPrimes sb =
123128
sb `seq` 2 : [ fromIntegral (i + i + 3) | (i, False) <- assocs sb ]
124129

125-
benchMark :: Technique -> IO ()
126-
benchMark tec = do
127-
threadDelay 1000000
130+
singleTest :: Technique -> IO (Int, Bool)
131+
singleTest tec = do
128132
strttm <- getPOSIXTime
129133
let loop _ [] = error "Should never get here!!!"
130134
loop passes (hd : rst) = do
@@ -135,20 +139,40 @@ benchMark tec = do
135139
now <- cmpstsBuffer `seq` getPOSIXTime -- force immediate execution
136140
let duration = now - strttm
137141
if duration < cFORTIME then passes `seq` loop (passes + 1) rst else
138-
let count = length $ listPrimes cmpstsBuffer in
139-
if count == cEXPECTED then
140-
let label = case tec of
141-
BitTwiddle -> "bittwiddle"
142-
Stride8 -> "stride8"
143-
Stride8Block -> "stride8-block16K"
144-
Extreme -> "extreme"
145-
ExtremeHybrid -> "extreme-hybrid"
146-
in putStrLn $ "GordonBGood_" ++ label ++ ";"
147-
++ show passes ++ ";" ++ show (realToFrac duration)
148-
++ ";1;algorithm=base,faithful=yes,bits=1"
149-
else putStrLn $ "Invalid result: " ++ show count ++ " primes." ++ show passes
142+
return (passes, length (listPrimes cmpstsBuffer) == cEXPECTED)
150143
loop 0 (repeat cLIMIT)
151144

152-
main :: IO ()
153-
main = forM_ [ BitTwiddle .. ExtremeHybrid ] benchMark
145+
threadedTest :: Int -> Technique -> IO (Int, Bool)
146+
threadedTest thrds tec = do
147+
setNumCapabilities thrds
148+
mvrs <- forM [1 .. thrds] $ const newEmptyMVar
149+
forM_ mvrs $ \ mvr -> forkIO $ do answr <- singleTest tec
150+
putMVar mvr $! answr
151+
rslts <- forM mvrs $ \ mvr -> takeMVar mvr
152+
return (sum $ map fst rslts, all snd rslts)
153+
154+
benchMark :: Int -> Technique -> IO ()
155+
benchMark thrds tec = do
156+
threadDelay 1000000
157+
strttm <- getPOSIXTime
158+
(passes, chk) <- if thrds < 2 then singleTest tec
159+
else threadedTest cNUMPROCS tec
160+
now <- chk `seq` getPOSIXTime -- force immediate execution
161+
let duration = now - strttm
162+
if chk then
163+
let label = case tec of
164+
BitTwiddle -> "bittwiddle"
165+
Stride8 -> "stride8"
166+
Stride8Block -> "stride8-block16K"
167+
Extreme -> "extreme"
168+
ExtremeHybrid -> "extreme-hybrid"
169+
in putStrLn $ "GordonBGood_" ++ label ++ ";"
170+
++ show passes ++ ";" ++ show (realToFrac duration)
171+
++ ";" ++ show thrds
172+
++ ";algorithm=base,faithful=yes,bits=1"
173+
else putStrLn "Invalid result!!!"
154174

175+
main :: IO ()
176+
main = do
177+
forM_ [ BitTwiddle .. ExtremeHybrid ] $ benchMark 1 -- single threaded
178+
forM_ [ BitTwiddle .. ExtremeHybrid ] $ benchMark cNUMPROCS -- multi threaded

Diff for: PrimeHaskell/solution_2/Primes.sh

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
#!/bin/bash
22

33
if lscpu | grep -qi "aarch64"; then
4-
ghc Primes
4+
ghc Primes -threaded
55
else
66
if lscpu | grep -qi "avx2"; then
7-
ghc Primes -DAVX2
7+
ghc Primes -DAVX2 -threaded
88
else
9-
ghc Primes -DAVX
9+
ghc Primes -DAVX -threaded
1010
fi
1111
fi

Diff for: PrimeHaskell/solution_2/README.md

+25-12
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
It is often spoken that functional languages such as Haskell must be slower than imperative ones; this implementation tries to dispell that notion.
99

10+
Although there is little point to a multi-threaded solution in showing which language is fastest for any of the languages as they will only show the effect of CPU throttling due to increased power usage for multiple cores and the effect of sharing resources, especially "Hyper-Threading" (HT)/""Simultaneous Multi Threading" (SMT) in sharing threads using common core execution unit resources and will be consistent in ratio to single threaded uses across languages, to be competitive a multi-threaded solution is provided. Since for the metric of work done per thread for HT/SMT threads when all available threads are used drops by almost a factor of two plus the thermal throttling factor, some implementations have used less than the maximum number of threads to gain an apparent advantage in the multi-threading leaderboard, with one precident example using 4 threads and some forcing 16 threads in order to gain an advantage in the main test machine which has 32 threads on 16 cores using HT/SMT. This seems objectionable as it tailors the test to this specific CPU and this implementation uses four threads, which should be available for all test machines. This will provide an advantage on the 16 core test machine in less thermal throttling and less sharing of compute engine resources, but it will be no more than the advantage of the other accepted implementation using four thread. As implied above, the multi-threading contest ruls should really be modified that all available threads must be used for a "maximum total work done" implementation.
11+
1012
The first three techniques used in this Haskell solution are implemented in an imperative style using `forM_` so that the core algorithm remains recognizable. Unlike the earlier solution, this solution does not use imported libraries to accomplish the task, so thus is `faithful to base`. The number representation is one bit per odd number.
1113

1214
The "stride8" techniques use a similar algorithm as the Rust "striped" algorithm but instead of changing the order of bits within the sieve buffer, leaves the order as normal and culls/marks them by "strides" in place, so thus is also `faithful base`. The actual loops are very simple and thus no separate storage implementation is used. The outer loop searches for the base prime values as required; The next inner loop level has a limit set so that it never runs more than eight times, then loops by just setting up the constant mask value and starting byte index to be used in the innermost actual marking loops. The boolean deliverable array is returned after masking off all values above the given range in the above two lines as those values may not have been processed and aren't desired in the output listing.
@@ -44,6 +46,7 @@ docker run --rm primes
4446

4547
## Output
4648

49+
The following outputs haven't been updated to show multi-threading results as the final Docker image shows that multi-theading is just directly proportional to the effect of thermal throttling this CPU from 3.6 GHz down to 3.2 GHz since it has no HT/SMT threads:
4750
- Intel SkyLake i5-6500, GHC Haskell version 8.10.7, no LLVM
4851

4952
```
@@ -84,19 +87,29 @@ Intel SkyLake i5-6500, GHC Haskell version 8.10.7, with LLVM (version 12) and 25
8487
GordonBGood_extreme-hybrid;39752;5.000006059;1;algorithm=base,faithful=yes,bits=1
8588
```
8689

87-
- Intel SkyLake i5-6500, docker, GHC Haskell version 8.8.4, with LLVM (version 11, which is likely a little slower), 256-bit registers
90+
- Intel SkyLake i5-6500, Docker, GHC Haskell version 8.8.4, with LLVM (version 11, which is likely a little slower), 256-bit registers
8891

8992
```
90-
Single-threaded
91-
┌───────┬────────────────┬──────────┬──────────────────────────────┬────────┬──────────┬─────────┬───────────┬──────────┬──────┬───────────────┐
92-
│ Index │ Implementation │ Solution │ Label │ Passes │ Duration │ Threads │ Algorithm │ Faithful │ Bits │ Passes/Second │
93-
├───────┼────────────────┼──────────┼──────────────────────────────┼────────┼──────────┼─────────┼───────────┼──────────┼──────┼───────────────┤
94-
│ 1 │ haskell │ 2 │ GordonBGood_extreme-hybrid │ 39659 │ 5.00000 │ 1 │ base │ yes │ 1 │ 7931.79792 │
95-
│ 2 │ haskell │ 2 │ GordonBGood_extreme │ 18140 │ 5.00015 │ 1 │ base │ yes │ 1 │ 3627.89364 │
96-
│ 3 │ haskell │ 2 │ GordonBGood_stride8-block16K │ 12276 │ 5.00021 │ 1 │ base │ yes │ 1 │ 2455.09589 │
97-
│ 4 │ haskell │ 2 │ GordonBGood_stride8 │ 11040 │ 5.00034 │ 1 │ base │ yes │ 1 │ 2207.85061 │
98-
│ 5 │ haskell │ 2 │ GordonBGood_bittwiddle │ 7237 │ 5.00014 │ 1 │ base │ yes │ 1 │ 1447.36032 │
99-
└───────┴────────────────┴──────────┴──────────────────────────────┴────────┴──────────┴─────────┴───────────┴──────────┴──────┴───────────────┘
93+
Single-threaded
94+
┌───────┬────────────────┬──────────┬──────────────────────────────┬────────┬──────────┬─────────┬───────────┬──────────┬──────┬───────────────┐
95+
│ Index │ Implementation │ Solution │ Label │ Passes │ Duration │ Threads │ Algorithm │ Faithful │ Bits │ Passes/Second │
96+
├───────┼────────────────┼──────────┼──────────────────────────────┼────────┼──────────┼─────────┼───────────┼──────────┼──────┼───────────────┤
97+
│ 1 │ haskell │ 2 │ GordonBGood_extreme-hybrid │ 39149 │ 5.00124 │ 1 │ base │ yes │ 1 │ 7827.85860 │
98+
│ 2 │ haskell │ 2 │ GordonBGood_extreme │ 17873 │ 5.00143 │ 1 │ base │ yes │ 1 │ 3573.58140 │
99+
│ 3 │ haskell │ 2 │ GordonBGood_stride8-block16K │ 12989 │ 5.00153 │ 1 │ base │ yes │ 1 │ 2597.00306 │
100+
│ 4 │ haskell │ 2 │ GordonBGood_stride8 │ 10638 │ 5.00148 │ 1 │ base │ yes │ 1 │ 2126.97251 │
101+
│ 5 │ haskell │ 2 │ GordonBGood_bittwiddle │ 8407 │ 5.00148 │ 1 │ base │ yes │ 1 │ 1680.90359 │
102+
└───────┴────────────────┴──────────┴──────────────────────────────┴────────┴──────────┴─────────┴───────────┴──────────┴──────┴───────────────┘
103+
Multi-threaded
104+
┌───────┬────────────────┬──────────┬──────────────────────────────┬────────┬──────────┬─────────┬───────────┬──────────┬──────┬───────────────┐
105+
│ Index │ Implementation │ Solution │ Label │ Passes │ Duration │ Threads │ Algorithm │ Faithful │ Bits │ Passes/Second │
106+
├───────┼────────────────┼──────────┼──────────────────────────────┼────────┼──────────┼─────────┼───────────┼──────────┼──────┼───────────────┤
107+
│ 1 │ haskell │ 2 │ GordonBGood_extreme-hybrid │ 143040 │ 5.00523 │ 4 │ base │ yes │ 1 │ 7144.52101 │
108+
│ 2 │ haskell │ 2 │ GordonBGood_extreme │ 56708 │ 5.00542 │ 4 │ base │ yes │ 1 │ 2832.32992 │
109+
│ 3 │ haskell │ 2 │ GordonBGood_stride8-block16K │ 48547 │ 5.00567 │ 4 │ base │ yes │ 1 │ 2424.59851 │
110+
│ 4 │ haskell │ 2 │ GordonBGood_stride8 │ 34046 │ 5.00520 │ 4 │ base │ yes │ 1 │ 1700.53002 │
111+
│ 5 │ haskell │ 2 │ GordonBGood_bittwiddle │ 26708 │ 5.02640 │ 4 │ base │ yes │ 1 │ 1328.38523 │
112+
└───────┴────────────────┴──────────┴──────────────────────────────┴────────┴──────────┴─────────┴───────────┴──────────┴──────┴───────────────┘
100113
```
101114

102115
## Notes
@@ -3807,4 +3820,4 @@ As common to all efficient SoE implementations, almost all of the expended time
38073820

38083821
## Author
38093822

3810-
W. Gordon Goodsman (GordonBGood)
3823+
W. Gordon Goodsman (GordonBGood)

0 commit comments

Comments
 (0)