Haskell/solution_2: add multi-threading (PlummersSoftwareLLC#919)

GordonBGood · web-flow · commit 2f5b425b741f · 2023-05-08T01:59:58.000+02:00
diff --git a/PrimeHaskell/solution_2/CHANGELOG.md b/PrimeHaskell/solution_2/CHANGELOG.md
@@ -3,3 +3,4 @@
 ## 0.1.0.0 -- 2021-09-26
 
 * Complete version including all five techniques and three "hybrid" compile options for 64, 128, and 256 bit registers.
+* Also includes Multi-threading for all of the above five techniques on four threads.
diff --git a/PrimeHaskell/solution_2/Primes.hs b/PrimeHaskell/solution_2/Primes.hs
@@ -7,14 +7,16 @@ import PrimesNoLSR ( Technique(..), primesSoENoLSR )
 import Data.Time.Clock.POSIX ( getPOSIXTime, POSIXTime )
 import Data.Word ( Word8, Word64 )
 import Data.Bits ( Bits((.|.), (.&.), shiftL, shiftR) )
-import Control.Concurrent ( threadDelay )
-import Control.Monad ( forM_, foldM_, foldM )
+import Data.Maybe (fromMaybe)
+import Control.Monad ( forM_, forM, foldM_, foldM )
 import Control.Monad.ST ( ST )
 import Data.Array ( Array )
 import Data.Array.Base ( MArray(newArray), STUArray(STUArray),
                          castSTUArray, unsafeRead, unsafeWrite,
                          UArray, listArray, assocs, unsafeAt )
 import Data.Array.ST ( runSTUArray )
+import Control.Concurrent ( threadDelay, setNumCapabilities, forkIO )
+import Control.Concurrent.MVar ( MVar, newEmptyMVar, putMVar, takeMVar )
 
 type Prime = Word64
 type SieveBuffer = UArray Int Bool
@@ -28,6 +30,9 @@ cFORTIME = 5
 cCPUL1CACHE :: Int 
 cCPUL1CACHE = 16384 -- in bytes, must be power of two
 
+-- | Thread count passed to 'benchMark' for the multi-threaded runs.
+cNUMPROCS :: Int
+cNUMPROCS = 4
+
 -- | Historical data for validating our results - the number of primes
 -- to be found under some limit, such as 168 primes under 1000
 primeCounts :: [(Prime, Int)]
@@ -42,7 +47,7 @@ primeCounts =
   ]
 
 cEXPECTED :: Int
-cEXPECTED = maybe 0 id $ lookup cLIMIT primeCounts
+cEXPECTED = fromMaybe 0 $ lookup cLIMIT primeCounts
 
 cBITMASK :: UArray Int Word8 -- faster than bit shifting...
 cBITMASK = listArray (0, 7) [ 1, 2, 4, 8, 16, 32, 64, 128 ]
@@ -122,9 +127,8 @@ listPrimes :: SieveBuffer -> [Prime]
 listPrimes sb =
    sb `seq` 2 : [ fromIntegral (i + i + 3) | (i, False) <- assocs sb ]
 
-benchMark :: Technique -> IO ()
-benchMark tec = do
-  threadDelay 1000000
+singleTest :: Technique -> IO (Int, Bool)
+singleTest tec = do
   strttm <- getPOSIXTime
   let loop _ [] = error "Should never get here!!!"
       loop passes (hd : rst) = do
@@ -135,20 +139,40 @@ benchMark tec = do
         now <- cmpstsBuffer `seq` getPOSIXTime -- force immediate execution
         let duration = now - strttm
         if duration < cFORTIME then passes `seq` loop (passes + 1) rst else
-          let count = length $ listPrimes cmpstsBuffer in
-          if count == cEXPECTED then
-            let label = case tec of
-                          BitTwiddle -> "bittwiddle"
-                          Stride8 -> "stride8"
-                          Stride8Block -> "stride8-block16K"
-                          Extreme -> "extreme"
-                          ExtremeHybrid -> "extreme-hybrid"
-            in putStrLn $ "GordonBGood_" ++ label ++ ";"
-                        ++ show passes ++ ";" ++ show (realToFrac duration)
-                        ++ ";1;algorithm=base,faithful=yes,bits=1"
-          else putStrLn $ "Invalid result:  " ++ show count ++ " primes." ++ show passes
+          return (passes, length (listPrimes cmpstsBuffer) == cEXPECTED)
   loop 0 (repeat cLIMIT)
 
-main :: IO ()
-main = forM_ [ BitTwiddle .. ExtremeHybrid ] benchMark
+-- | Run 'singleTest' concurrently on @thrds@ forked threads and merge the
+-- outcomes: the pass counts are summed and the validity flags are combined,
+-- so the result is only flagged valid when every worker's result was.
+threadedTest :: Int -> Technique -> IO (Int, Bool)
+threadedTest thrds tec = do
+  setNumCapabilities thrds
+  -- one empty result slot per worker
+  slots <- mapM (const newEmptyMVar) [1 .. thrds]
+  -- start every worker before waiting on any of them
+  mapM_ (\ slot -> forkIO (singleTest tec >>= (putMVar slot $!))) slots
+  -- collect the results in slot order; each take blocks until filled
+  outcomes <- mapM takeMVar slots
+  let (counts, flags) = unzip outcomes
+  return (sum counts, and flags)
+
+-- | Run one timed benchmark of a technique and print its result line.
+--
+-- When @thrds@ is less than 2 the test is run through 'singleTest';
+-- otherwise 'threadedTest' is run with @thrds@ workers.  On success a
+-- semicolon separated line of label, passes, duration in seconds, thread
+-- count and tags is printed; when the result check fails, the text
+-- "Invalid result!!!" is printed instead.
+benchMark :: Int -> Technique -> IO ()
+benchMark thrds tec = do
+  threadDelay 1000000 -- pause for one second before the timed section
+  strttm <- getPOSIXTime
+  -- honour the requested thread count rather than the global constant so
+  -- the thread count reported below always matches the workers really run
+  (passes, chk) <- if thrds < 2 then singleTest tec
+                   else threadedTest thrds tec
+  now <- chk `seq` getPOSIXTime -- force immediate execution
+  let duration = now - strttm
+  if chk then
+    let label = case tec of
+                  BitTwiddle -> "bittwiddle"
+                  Stride8 -> "stride8"
+                  Stride8Block -> "stride8-block16K"
+                  Extreme -> "extreme"
+                  ExtremeHybrid -> "extreme-hybrid"
+    in putStrLn $ "GordonBGood_" ++ label ++ ";"
+                ++ show passes ++ ";" ++ show (realToFrac duration)
+                ++ ";" ++ show thrds
+                ++ ";algorithm=base,faithful=yes,bits=1"
+  else putStrLn "Invalid result!!!"
 
+-- | Benchmark every technique single threaded, then again multi threaded.
+main :: IO ()
+main = do
+  let techniques = [ BitTwiddle .. ExtremeHybrid ]
+  mapM_ (benchMark 1) techniques         -- single threaded
+  mapM_ (benchMark cNUMPROCS) techniques -- multi threaded
diff --git a/PrimeHaskell/solution_2/Primes.sh b/PrimeHaskell/solution_2/Primes.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
 
 if lscpu | grep -qi "aarch64"; then
-  ghc Primes
+  ghc Primes -threaded
 else
   if lscpu | grep -qi "avx2"; then
-    ghc Primes -DAVX2
+    ghc Primes -DAVX2 -threaded
   else
-    ghc Primes -DAVX
+    ghc Primes -DAVX -threaded
   fi
 fi
diff --git a/PrimeHaskell/solution_2/README.md b/PrimeHaskell/solution_2/README.md
@@ -7,6 +7,8 @@
 
 It is often spoken that functional languages such as Haskell must be slower than imperative ones; this implementation tries to dispell that notion.
 
+Although there is little point to a multi-threaded solution in showing which language is fastest for any of the languages, as they will only show the effect of CPU throttling due to increased power usage for multiple cores and the effect of sharing resources, especially "Hyper-Threading" (HT)/"Simultaneous Multi Threading" (SMT) in sharing threads using common core execution unit resources, and will be consistent in ratio to single threaded uses across languages, to be competitive a multi-threaded solution is provided.  Since the metric of work done per thread for HT/SMT threads when all available threads are used drops by almost a factor of two plus the thermal throttling factor, some implementations have used fewer than the maximum number of threads to gain an apparent advantage in the multi-threading leaderboard, with one precedent example using 4 threads and some forcing 16 threads in order to gain an advantage on the main test machine, which has 32 threads on 16 cores using HT/SMT.  This seems objectionable as it tailors the test to this specific CPU, so this implementation uses four threads, which should be available for all test machines.  This will provide an advantage on the 16 core test machine in less thermal throttling and less sharing of compute engine resources, but it will be no more than the advantage of the other accepted implementation using four threads.  As implied above, the multi-threading contest rules should really be modified so that all available threads must be used for a "maximum total work done" implementation.
+
 The first three techniques used in this Haskell solution are implemented in an imperative style using `forM_` so that the core algorithm remains recognizable.  Unlike the earlier solution, this solution does not use imported libraries to accomplish the task, so thus is `faithful to base`.  The number representation is one bit per odd number.
 
 The "stride8" techniques use a similar algorithm as the Rust "striped" algorithm but instead of changing the order of bits within the sieve buffer, leaves the order as normal and culls/marks them by "strides" in place, so thus is also `faithful base`.  The actual loops are very simple and thus no separate storage implementation is used.  The outer loop searches for the base prime values as required; The next inner loop level has a limit set so that it never runs more than eight times, then loops by just setting up the constant mask value and starting byte index to be used in the innermost actual marking loops.  The boolean deliverable array is returned after masking off all values above the given range in the above two lines as those values may not have been processed and aren't desired in the output listing.
@@ -44,6 +46,7 @@ docker run --rm primes
 
 ## Output
 
+The following outputs haven't been updated to show multi-threading results, as the final Docker image shows that multi-threading is just directly proportional to the effect of thermal throttling of this CPU from 3.6 GHz down to 3.2 GHz since it has no HT/SMT threads:
 - Intel SkyLake i5-6500, GHC Haskell version 8.10.7, no LLVM
 
   ```
@@ -84,19 +87,29 @@ Intel SkyLake i5-6500, GHC Haskell version 8.10.7, with LLVM (version 12) and 25
   GordonBGood_extreme-hybrid;39752;5.000006059;1;algorithm=base,faithful=yes,bits=1
   ```
 
-- Intel SkyLake i5-6500, docker, GHC Haskell version 8.8.4, with LLVM (version 11, which is likely a little slower), 256-bit registers
+- Intel SkyLake i5-6500, Docker, GHC Haskell version 8.8.4, with LLVM (version 11, which is likely a little slower), 256-bit registers
 
   ```
-                                                                  Single-threaded                                                                 
-  ┌───────┬────────────────┬──────────┬──────────────────────────────┬────────┬──────────┬─────────┬───────────┬──────────┬──────┬───────────────┐
-  │ Index │ Implementation │ Solution │ Label                        │ Passes │ Duration │ Threads │ Algorithm │ Faithful │ Bits │ Passes/Second │
-  ├───────┼────────────────┼──────────┼──────────────────────────────┼────────┼──────────┼─────────┼───────────┼──────────┼──────┼───────────────┤
-  │   1   │ haskell        │ 2        │ GordonBGood_extreme-hybrid   │ 39659  │ 5.00000  │    1    │   base    │   yes    │ 1    │  7931.79792   │
-  │   2   │ haskell        │ 2        │ GordonBGood_extreme          │ 18140  │ 5.00015  │    1    │   base    │   yes    │ 1    │  3627.89364   │
-  │   3   │ haskell        │ 2        │ GordonBGood_stride8-block16K │ 12276  │ 5.00021  │    1    │   base    │   yes    │ 1    │  2455.09589   │
-  │   4   │ haskell        │ 2        │ GordonBGood_stride8          │ 11040  │ 5.00034  │    1    │   base    │   yes    │ 1    │  2207.85061   │
-  │   5   │ haskell        │ 2        │ GordonBGood_bittwiddle       │  7237  │ 5.00014  │    1    │   base    │   yes    │ 1    │  1447.36032   │
-  └───────┴────────────────┴──────────┴──────────────────────────────┴────────┴──────────┴─────────┴───────────┴──────────┴──────┴───────────────┘
+                                                                 Single-threaded                                                                 
+┌───────┬────────────────┬──────────┬──────────────────────────────┬────────┬──────────┬─────────┬───────────┬──────────┬──────┬───────────────┐
+│ Index │ Implementation │ Solution │ Label                        │ Passes │ Duration │ Threads │ Algorithm │ Faithful │ Bits │ Passes/Second │
+├───────┼────────────────┼──────────┼──────────────────────────────┼────────┼──────────┼─────────┼───────────┼──────────┼──────┼───────────────┤
+│   1   │ haskell        │ 2        │ GordonBGood_extreme-hybrid   │ 39149  │ 5.00124  │    1    │   base    │   yes    │ 1    │  7827.85860   │
+│   2   │ haskell        │ 2        │ GordonBGood_extreme          │ 17873  │ 5.00143  │    1    │   base    │   yes    │ 1    │  3573.58140   │
+│   3   │ haskell        │ 2        │ GordonBGood_stride8-block16K │ 12989  │ 5.00153  │    1    │   base    │   yes    │ 1    │  2597.00306   │
+│   4   │ haskell        │ 2        │ GordonBGood_stride8          │ 10638  │ 5.00148  │    1    │   base    │   yes    │ 1    │  2126.97251   │
+│   5   │ haskell        │ 2        │ GordonBGood_bittwiddle       │  8407  │ 5.00148  │    1    │   base    │   yes    │ 1    │  1680.90359   │
+└───────┴────────────────┴──────────┴──────────────────────────────┴────────┴──────────┴─────────┴───────────┴──────────┴──────┴───────────────┘
+                                                                 Multi-threaded                                                                 
+┌───────┬────────────────┬──────────┬──────────────────────────────┬────────┬──────────┬─────────┬───────────┬──────────┬──────┬───────────────┐
+│ Index │ Implementation │ Solution │ Label                        │ Passes │ Duration │ Threads │ Algorithm │ Faithful │ Bits │ Passes/Second │
+├───────┼────────────────┼──────────┼──────────────────────────────┼────────┼──────────┼─────────┼───────────┼──────────┼──────┼───────────────┤
+│   1   │ haskell        │ 2        │ GordonBGood_extreme-hybrid   │ 143040 │ 5.00523  │    4    │   base    │   yes    │ 1    │  7144.52101   │
+│   2   │ haskell        │ 2        │ GordonBGood_extreme          │ 56708  │ 5.00542  │    4    │   base    │   yes    │ 1    │  2832.32992   │
+│   3   │ haskell        │ 2        │ GordonBGood_stride8-block16K │ 48547  │ 5.00567  │    4    │   base    │   yes    │ 1    │  2424.59851   │
+│   4   │ haskell        │ 2        │ GordonBGood_stride8          │ 34046  │ 5.00520  │    4    │   base    │   yes    │ 1    │  1700.53002   │
+│   5   │ haskell        │ 2        │ GordonBGood_bittwiddle       │ 26708  │ 5.02640  │    4    │   base    │   yes    │ 1    │  1328.38523   │
+└───────┴────────────────┴──────────┴──────────────────────────────┴────────┴──────────┴─────────┴───────────┴──────────┴──────┴───────────────┘
   ```
 
 ## Notes
@@ -3807,4 +3820,4 @@ As common to all efficient SoE implementations, almost all of the expended time
 
 ## Author
 
-W. Gordon Goodsman (GordonBGood)
+W. Gordon Goodsman (GordonBGood)

Original file line number	Diff line number	Diff line change
`@@ -3,3 +3,4 @@`
`3`	`3`	`## 0.1.0.0 -- 2021-09-26`
`4`	`4`
`5`	`5`	`* Complete version including all five techniques and three "hybrid" compile options for 64, 128, and 256 bit registers.`
	`6`	`+* Also includes Multi-threading for all of the above five techniques on four threads.`