From 46749895bc8e0e3777bcf022b2cd758dca46e4e7 Mon Sep 17 00:00:00 2001 From: Jack Wines Date: Mon, 23 Dec 2024 08:29:31 -0500 Subject: [PATCH] parallel compression and decompression I've selected a chunk size of 8000 bytes rather arbitrarily, but it seems to work just fine. The bigger problem is that the simplified final fraction makes the running fraction in decompression's foldl more complicated as time goes on, making it around 50 times slower than compression. It takes just under an hour to decompress the Great Gatsby. --- compress.cabal | 7 +- src/Compress/Arithmetic.hs | 132 ++++++++++++++++++++++++++----------- src/Main.hs | 7 +- 3 files changed, 106 insertions(+), 40 deletions(-) diff --git a/compress.cabal b/compress.cabal index 8939f3c..2b7cb2c 100644 --- a/compress.cabal +++ b/compress.cabal @@ -53,7 +53,9 @@ executable compress optparse-generic, vector, nonempty-containers, - primes + witch, + monad-par, + monad-par-extras default-language: GHC2021 other-modules: @@ -63,8 +65,9 @@ executable compress Compress.PrefixTree Data.HuffmanTree Compress.Arithmetic + Data.WordyMap ghc-options: -threaded -fprof-auto -fprof-late - "-with-rtsopts=-p -hc" + "-with-rtsopts=-p -hc -B -N -qa" diff --git a/src/Compress/Arithmetic.hs b/src/Compress/Arithmetic.hs index bf0f6ec..5daae19 100644 --- a/src/Compress/Arithmetic.hs +++ b/src/Compress/Arithmetic.hs @@ -12,12 +12,13 @@ import Data.List (genericLength) import Data.List qualified as L import Data.Map.Strict qualified as M import Data.Maybe qualified as My -import Data.Numbers.Primes import Data.Ord import Data.Ratio import Data.Serialize qualified as C import GHC.Generics (Generic) import GHC.Natural (Natural) +import qualified Control.Monad.Par as P +import Control.Monad.Par (NFData) -- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes @@ -30,21 +31,32 @@ data WordMarkovStats = WordMarkovStats } deriving (Show, Eq, Ord, Generic, C.Serialize) -toRing :: forall a k. (Ord k, Integral a, Bounded a) => M.Map k a -> M.Map k WordMarkovStats -toRing m = M.fromList . zip (map fst asList) $ zipWith WordMarkovStats (scanl (+) 0 . map snd $ asList) (map snd asList) +toRing :: forall a k. (Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats +toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats where - asList = M.toList $ m' - sum' = - sum - . map fromIntegral - . M.elems - $ m - m' = M.map ((% sum') . fromIntegral) m + + sizes = map ((% maxBound') . fromIntegral . snd) xs + + wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized + + asFracsOfTotalSum = map fst . tail . L.scanl' f (0, 0) $ sizes maxBound' :: Integer maxBound' = fromIntegral (maxBound :: a) -twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k (M.Map k b) + withBumpedZeroSized + | numZeroSized /= 0 = map (max (remainingSpace / numZeroSized)) asFracsOfTotalSum + | otherwise = asFracsOfTotalSum + + remainingSpace = 1 - sum asFracsOfTotalSum + + numZeroSized = L.genericLength . filter (== 0) $ asFracsOfTotalSum + + f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum) + where + newFrac = currFrac * (1 - runningSum) + +twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k [(k, b)] twoByteMarkov xs = M.map sizeAsFraction . M.fromListWith (M.unionWith (+)) @@ -53,69 +65,115 @@ twoByteMarkov xs = . tail $ xs where - sizeAsFraction m = M.map (max 1 . floor . fromRational . (* maxBound') . (% sum')) m + -- optimization, changes fractions so they represent the fraction + -- of remaining space in the list taken up instead of total space + sizeAsFraction :: M.Map k Integer -> [(k, b)] + sizeAsFraction m = + zip keys + . map discretizeFraction + . fractionOfRemainingSums + $ counts where - sum' = sum . M.elems $ m + fractionOfRemainingSums xs = zipWith (%) xs . scanr (+) 0 $ xs - toInteger :: (Integral a) => a -> Integer - toInteger = fromIntegral + asList = L.sortOn (negate . snd) . M.assocs $ m + keys = map fst asList + + counts = map snd asList + + discretizeFraction :: Rational -> b + discretizeFraction = floor . fromRational . (* maxBound') + + maxBound' :: Ratio Integer maxBound' = fromIntegral (maxBound :: b) data Compressed a = Compressed - { markovs :: M.Map a (M.Map a (Word8)), + { markovs :: M.Map a [(a, Word8)], + chunks :: [Chunk a] + } + deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData) + +data Chunk a = Chunk { location :: Ratio Integer, start :: a, length :: Int - } - deriving (Eq, Ord, Show, Generic, C.Serialize) + } deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData) decompress :: forall a. - (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) => + (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) => Compressed a -> [a] -decompress (Compressed {..}) = take length $ map snd . L.iterate decompress' $ (location, start) +decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ chunks where - decompress' :: (Ratio Integer, a) -> (Ratio Integer, a) - decompress' (loc, prev) = ((loc - ansLoc) / newSize, newVal) - where - (ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev) rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size)) + decompressChunk :: Chunk a -> [a] + decompressChunk (Chunk {..}) = map snd . take length . L.iterate' decompress' $ (location, start) + where + decompress' :: (Ratio Integer, a) -> (Ratio Integer, a) + decompress' (!loc, !prev) = ((loc - ansLoc) / newSize, newVal) + where + ansLoc :: Ratio Integer + newVal :: a + newSize :: Ratio Integer + (!ansLoc, (!newVal, !newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev) + sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8] sanityCheck = map (sum . M.elems) . M.elems +chunk chunkSize = chunk' + where + chunk' [] = [] + chunk' xs = xs' : chunk' xs'' + where + (xs', xs'') = splitAt chunkSize xs + +chunkLength = 8000 + compress :: forall a. - (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) => + (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) => [a] -> Compressed a -compress toCompress = Compressed twoByteMarkovs (shortestLocation endStats) (head toCompress) (genericLength toCompress) +compress toCompress = Compressed twoByteMarkovs . P.runPar . P.parMap compressChunk $ unCompressedChunks where + + unCompressedChunks = chunk chunkLength toCompress + twoByteMarkovs = twoByteMarkov toCompress rings = M.map toRing twoByteMarkovs - pairs = zip toCompress . tail $ toCompress + compressChunk toCompress = Chunk { + location = shortestLocation endStats, + start = head toCompress, + length = L.length toCompress + } + where - shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size) + pairs = zip toCompress . tail $ toCompress - endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs + shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size) - addWordMarkovStats - (WordMarkovStats {location = prevLoc, size = prevSize}) - (WordMarkovStats {location = nextLoc, size = nextSize}) = - WordMarkovStats - { location = prevLoc + (prevSize * nextLoc), - size = prevSize * nextSize - } + endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs - statsFor (x0, x1) = (rings M.! x0) M.! x1 + statsFor (x0, x1) = (rings M.! x0) M.! x1 + addWordMarkovStats + (WordMarkovStats {location = prevLoc, size = prevSize}) + (WordMarkovStats {location = nextLoc, size = nextSize}) = + WordMarkovStats + { location = prevLoc + (prevSize * nextLoc), + size = prevSize * nextSize + } + + +pyramidFold :: (a -> a -> a) -> [a] -> a pyramidFold f = pyramid where pyramid [x] = x diff --git a/src/Main.hs b/src/Main.hs index a019f57..cbb5cf0 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -52,7 +52,12 @@ main :: IO () main = do -- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy" f <- BS.getContents - BS.putStr . C.encode . A.compress . (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f + let fAsWords = (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f + let compressedUnencoded :: A.Compressed Word8 = (A.compress fAsWords) + let compressed = C.encode compressedUnencoded + print ("compression ratio", (fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f)) + print ("works?", (A.decompress compressedUnencoded) == fAsWords) + -- let f = "hello tehre" -- f <- BS.readFile "pg64317.txt" -- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f