parallel compression and decompression

I've selected a chunk size of 8000 bytes rather arbitrarily, but it seems to work just fine. The bigger problem is that the simplified final fraction makes the running fraction in decompression's foldl more complicated as time goes on, making it around 50 times slower than compression. It takes just under an hour to decompress the Great Gatsby.
2024-12-23 08:29:31 -05:00 · 2024-12-23 08:29:31 -05:00 · 46749895bc
commit 46749895bc
parent 791fff6107
3 changed files with 106 additions and 40 deletions
--- a/compress.cabal
+++ b/compress.cabal
@ -53,7 +53,9 @@ executable compress
    optparse-generic,
    vector,
    nonempty-containers,
-    primes
+    witch,
+    monad-par,
+    monad-par-extras
  default-language:
    GHC2021
  other-modules:
@ -63,8 +65,9 @@ executable compress
     Compress.PrefixTree
     Data.HuffmanTree
     Compress.Arithmetic
+     Data.WordyMap
  ghc-options:
    -threaded
    -fprof-auto
    -fprof-late
-    "-with-rtsopts=-p -hc"
+    "-with-rtsopts=-p -hc -B -N -qa"
--- a/src/Compress/Arithmetic.hs
+++ b/src/Compress/Arithmetic.hs
@ -12,12 +12,13 @@ import Data.List (genericLength)
 import Data.List qualified as L
 import Data.Map.Strict qualified as M
 import Data.Maybe qualified as My
-import Data.Numbers.Primes
 import Data.Ord
 import Data.Ratio
 import Data.Serialize qualified as C
 import GHC.Generics (Generic)
 import GHC.Natural (Natural)
+import qualified Control.Monad.Par as P
+import Control.Monad.Par (NFData)

 -- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes

@ -30,21 +31,32 @@ data WordMarkovStats = WordMarkovStats
  }
  deriving (Show, Eq, Ord, Generic, C.Serialize)

-toRing :: forall a k. (Ord k, Integral a, Bounded a) => M.Map k a -> M.Map k WordMarkovStats
-toRing m = M.fromList . zip (map fst asList) $ zipWith WordMarkovStats (scanl (+) 0 . map snd $ asList) (map snd asList)
+toRing :: forall a k. (Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
+toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
  where
-    asList = M.toList $ m'
-    sum' =
-      sum
-        . map fromIntegral
-        . M.elems
-        $ m
-    m' = M.map ((% sum') . fromIntegral) m
+
+    sizes = map ((% maxBound') . fromIntegral . snd) xs
+
+    wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
+
+    asFracsOfTotalSum = map fst . tail . L.scanl' f (0, 0) $ sizes

    maxBound' :: Integer
    maxBound' = fromIntegral (maxBound :: a)

-twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k (M.Map k b)
+    withBumpedZeroSized
+      | numZeroSized /= 0 = map (max (remainingSpace / numZeroSized)) asFracsOfTotalSum
+      | otherwise = asFracsOfTotalSum
+
+    remainingSpace = 1 - sum asFracsOfTotalSum
+
+    numZeroSized = L.genericLength . filter (== 0) $ asFracsOfTotalSum
+
+    f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
+      where
+        newFrac = currFrac * (1 - runningSum)
+
+twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k [(k, b)]
 twoByteMarkov xs =
  M.map sizeAsFraction
    . M.fromListWith (M.unionWith (+))
@ -53,59 +65,105 @@ twoByteMarkov xs =
    . tail
    $ xs
  where
-    sizeAsFraction m = M.map (max 1 . floor . fromRational . (* maxBound') . (% sum')) m
+    -- optimization, changes fractions so they represent the fraction
+    -- of remaining space in the list taken up instead of total space
+    sizeAsFraction :: M.Map k Integer -> [(k, b)]
+    sizeAsFraction m =
+        zip keys
+        . map discretizeFraction
+        . fractionOfRemainingSums
+        $ counts
      where
-        sum' = sum . M.elems $ m
+        fractionOfRemainingSums xs = zipWith (%) xs . scanr (+) 0 $ xs

-    toInteger :: (Integral a) => a -> Integer
-    toInteger = fromIntegral
+        asList = L.sortOn (negate . snd) . M.assocs $ m

+        keys = map fst asList
+
+        counts = map snd asList
+
+    discretizeFraction :: Rational -> b
+    discretizeFraction = floor . fromRational . (* maxBound')
+
+    maxBound' :: Ratio Integer
    maxBound' = fromIntegral (maxBound :: b)

 data Compressed a = Compressed
-  { markovs :: M.Map a (M.Map a (Word8)),
+  { markovs :: M.Map a [(a, Word8)],
+    chunks :: [Chunk a]
+  }
+  deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
+
+data Chunk a = Chunk {
    location :: Ratio Integer,
    start :: a,
    length :: Int
-  }
-  deriving (Eq, Ord, Show, Generic, C.Serialize)
+                     } deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)

 decompress ::
  forall a.
-  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) =>
+  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) =>
  Compressed a ->
  [a]
-decompress (Compressed {..}) = take length $ map snd . L.iterate decompress' $ (location, start)
+decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ chunks
  where
-    decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
-    decompress' (loc, prev) = ((loc - ansLoc) / newSize, newVal)
-      where
-        (ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)

    rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs

    toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))

+    decompressChunk :: Chunk a -> [a]
+    decompressChunk (Chunk {..}) = map snd . take length . L.iterate' decompress' $ (location, start)
+      where
+        decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
+        decompress' (!loc, !prev) = ((loc - ansLoc) / newSize, newVal)
+          where
+            ansLoc :: Ratio Integer
+            newVal :: a
+            newSize :: Ratio Integer
+            (!ansLoc, (!newVal, !newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)
+
 sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
 sanityCheck = map (sum . M.elems) . M.elems

+chunk chunkSize = chunk'
+  where
+    chunk' [] = []
+    chunk' xs = xs' : chunk' xs''
+      where
+        (xs', xs'') = splitAt chunkSize xs
+
+chunkLength = 8000
+
 compress ::
  forall a.
-  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) =>
+  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) =>
  [a] ->
  Compressed a
-compress toCompress = Compressed twoByteMarkovs (shortestLocation endStats) (head toCompress) (genericLength toCompress)
+compress toCompress = Compressed twoByteMarkovs . P.runPar . P.parMap compressChunk $ unCompressedChunks
  where
+
+    unCompressedChunks = chunk chunkLength toCompress
+
    twoByteMarkovs = twoByteMarkov toCompress

    rings = M.map toRing twoByteMarkovs

+    compressChunk toCompress = Chunk {
+      location = shortestLocation endStats,
+      start = head toCompress,
+      length = L.length toCompress
+                                     }
+      where
+
        pairs = zip toCompress . tail $ toCompress

        shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size)

        endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs

+        statsFor (x0, x1) = (rings M.! x0) M.! x1
+
        addWordMarkovStats
          (WordMarkovStats {location = prevLoc, size = prevSize})
          (WordMarkovStats {location = nextLoc, size = nextSize}) =
@ -114,8 +172,8 @@ compress toCompress = Compressed twoByteMarkovs (shortestLocation endStats) (hea
                size = prevSize * nextSize
              }

-    statsFor (x0, x1) = (rings M.! x0) M.! x1

+pyramidFold :: (a -> a -> a) -> [a] -> a
 pyramidFold f = pyramid
  where
    pyramid [x] = x
--- a/src/Main.hs
+++ b/src/Main.hs
@ -52,7 +52,12 @@ main :: IO ()
 main = do
  -- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
  f <- BS.getContents
-  BS.putStr . C.encode . A.compress . (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
+  let fAsWords = (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
+  let compressedUnencoded :: A.Compressed Word8 = (A.compress fAsWords)
+  let compressed = C.encode compressedUnencoded
+  print ("compression ratio", (fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
+  print ("works?", (A.decompress compressedUnencoded) == fAsWords)
+
  -- let f = "hello tehre"
  -- f <- BS.readFile "pg64317.txt"
  -- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f