Why is my parallel code even slower than without parallelism?-CodePudding

I followed Simon Marlow's book on parallel Haskell (Chapter 1) using rpar/rseq .

Below is the code (Solving the Squid Game bridge simulation):

{-# LANGUAGE FlexibleContexts #-}

import Control.DeepSeq (force)
import Control.Exception (evaluate)
import Control.Parallel.Strategies
import Data.Array.IO
  ( IOUArray,
    getAssocs,
    newListArray,
    readArray,
    writeArray,
  )
import Data.Functor ((<&>))
import System.Environment (getArgs)
import System.Random (randomRIO)

game ::
  Int -> -- number of steps
  Int -> -- number of glass at each step
  Int -> -- number of players
  IO Int -- return the number of survivors
game totalStep totalGlass = go 1 totalGlass
  where
    go currentStep currentGlass numSurvivors
      | numSurvivors == 0 || currentStep > totalStep = return numSurvivors
      | otherwise = do
        r <- randomRIO (1, currentGlass)
        if r == 1
          then go (currentStep   1) totalGlass numSurvivors
          else go currentStep (currentGlass - 1) (numSurvivors - 1)

simulate :: Int -> IO Int -> IO [(Int, Int)]
simulate n game =
  (newListArray (0, 16) (replicate 17 0) :: IO (IOUArray Int Int))
    >>= go 1
    >>= getAssocs
  where
    go i marr
      | i <= n = do
        r <- game
        readArray marr r >>= writeArray marr r . (  1)
        go (i   1) marr
      | otherwise = return marr

main1 :: IO ()
main1 = do
  [n, steps, glassNum, playNum] <- getArgs <&> Prelude.map read
  res <- simulate n (game steps glassNum playNum)
  mapM_ print res

main2 :: IO ()
main2 = do
  putStrLn "Running main2"
  [n, steps, glassNum, playNum] <- getArgs <&> Prelude.map read
  res <- runEval $ do
    r1 <- rpar $ simulate (div n 2) (game steps glassNum playNum) >>= evaluate . force
    r2 <- rpar $ simulate (div n 2) (game steps glassNum playNum) >>= evaluate . force
    rseq r1
    rseq r2
    return $
      (\l1 l2 -> zipWith (\e1 e2 -> (fst e1, snd e1   snd e2)) l1 l2)
        <$> r1
        <*> r2

  mapM_ print res

main = main2

For main2, I've compiled using:

ghc -O2 -threaded ./squid.hs

and run as:

./squid 10000000 18 2 16  RTS -N2

I can't understand why main1 is faster than main2 while main2 has parallelism in it.

Could anyone give me some comments on my code as to whether this is the correct use of parallelism?

Update: Here's the updated version (the new random is quite cumbersome to use):

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RankNTypes #-}

import Control.Monad.ST (ST, runST)
import Control.Parallel.Strategies (rpar, rseq, runEval)
import Data.Array.ST
  ( STUArray,
    getAssocs,
    newListArray,
    readArray,
    writeArray,
  )
import Data.Functor ((<&>))
import System.Environment (getArgs)
import System.Random (StdGen)
import System.Random.Stateful
  ( StdGen,
    applySTGen,
    mkStdGen,
    runSTGen,
    uniformR,
  )

game ::
  Int -> -- number of steps
  Int -> -- number of glass at each step
  Int -> -- number of players
  StdGen ->
  ST s (Int, StdGen) -- return the number of survivors
game ns ng = go 1 ng
  where
    go
      !cs -- current step number
      !cg -- current glass number
      !ns -- number of survivors
      !pg -- pure generator
        | ns == 0 || cs > ns = return (ns, pg)
        | otherwise = do
          let (r, g') = runSTGen pg (applySTGen (uniformR (1, cg)))
          if r == 1
            then go (cs   1) ng ns g'
            else go cs (cg - 1) (ns - 1) g'

simulate :: Int -> (forall s. StdGen -> ST s (Int, StdGen)) -> [(Int, Int)]
simulate n game =
  runST $
    (newListArray (0, 16) (replicate 17 0) :: ST s1 (STUArray s1 Int Int))
      >>= go 1 (mkStdGen n)
      >>= getAssocs
  where
    go !i !g !marr
      | i <= n = do
        (r, g') <- game g
        readArray marr r >>= writeArray marr r . (  1)
        go (i   1) g' marr
      | otherwise = return marr

main :: IO ()
main = do
  [n, steps, glassNum, playNum] <- getArgs <&> Prelude.map read
  let res = runEval $ do
        r1 <- rpar $ simulate (div n 2 - 1) (game steps glassNum playNum)
        r2 <- rpar $ simulate (div n 2   1) (game steps glassNum playNum)
        rseq r1
        rseq r2
        return $ zipWith (\e1 e2 -> (fst e1, snd e1   snd e2)) r1 r2
  mapM_ print res

Update 2:

Use pure code and the elapsed time is down to 7 seconds.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RankNTypes #-}

import Control.Monad.ST ( runST, ST )
import Control.Parallel ( par, pseq )
import Data.Array.ST
    ( getAssocs, newListArray, readArray, writeArray, STUArray )
import Data.Functor ((<&>))
import System.Environment (getArgs)
import System.Random (StdGen, uniformR, mkStdGen)
game ::
  Int -> -- number of total steps
  Int -> -- number of glass at each step
  Int -> -- number of players
  StdGen ->
  (Int, StdGen) -- return the number of survivors
game ts ng = go 1 ng
  where
    go
      !cs -- current step number
      !cg -- current glass number
      !ns -- number of survivors
      !pg -- pure generator
        | ns == 0 || cs > ts = (ns, pg)
        | otherwise = do
          let (r, g') = uniformR (1, cg) pg
          if r == 1
            then go (cs   1) ng ns g'
            else go cs (cg - 1) (ns - 1) g'

simulate :: Int -> (StdGen -> (Int, StdGen)) -> [(Int, Int)]
simulate n game =
  runST $
    (newListArray (0, 16) (replicate 17 0) :: ST s1 (STUArray s1 Int Int))
      >>= go 1 (mkStdGen n)
      >>= getAssocs
  where
    go !i !g !marr
      | i <= n = do
        let (r, g') = game g
        readArray marr r >>= writeArray marr r . (  1)
        go (i   1) g' marr
      | otherwise = return marr

main :: IO ()
main = do
  [n, steps, glassNum, playNum] <- getArgs <&> Prelude.map read

  let r1 = simulate (div n 2 - 1) (game steps glassNum playNum)
      r2 = simulate (div n 2   1) (game steps glassNum playNum)
      res = zipWith (\e1 e2 -> (fst e1, snd e1   snd e2)) r1 r2

      res' = par r1 (pseq r2 res)

  mapM_ print res'

CodePudding user response：

You aren't actually using any parallelism. You write

    r1 <- rpar $ simulate (div n 2) (game steps glassNum playNum) >>= evaluate . force

This sparks a thread to evaluate an IO action, not to run it. That's not useful.

Since your simulate is essentially pure, you should convert it from IO to ST s by swapping in the appropriate array types, etc. Then you can rpar (runST $ simulate ...) and actually do work in parallel. I don't think the force invocations are useful/appropriate in context; they'll free the arrays sooner, but at significant cost.