parallel compression and decompression

I've selected a chunk size of 8000 bytes rather arbitrarily, but it
seems to work just fine. The bigger problem is that the simplified
final fraction makes the running fraction in decompression's foldl
grow more complicated as decoding proceeds, which leaves decompression
around 50 times slower than compression. It takes just under an hour
to decompress The Great Gatsby.
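
For a sense of where that decompression cost comes from, here is a minimal standalone sketch (not part of this commit; the per-symbol intervals are made up) that composes (location, size) intervals the same way addWordMarkovStats in the diff does and prints the digit count of the running location's denominator. The digit count grows roughly linearly with the number of symbols folded in, so a single fold over a whole file does quadratic work on exact Rationals, which is what capping chunks at 8000 symbols keeps bounded.

import Data.Ratio ((%), denominator)

-- compose two (location, size) intervals, shaped like addWordMarkovStats
compose :: (Rational, Rational) -> (Rational, Rational) -> (Rational, Rational)
compose (prevLoc, prevSize) (nextLoc, nextSize) =
  (prevLoc + prevSize * nextLoc, prevSize * nextSize)

main :: IO ()
main = do
  -- hypothetical per-symbol intervals with Word8-sized denominators
  let steps = [(k % 256, 3 % 256) | k <- [1, 5 ..]]
      folds = scanl1 compose (take 4000 steps)
  -- digit count of the running location's denominator after 1, 1000, ..., 4000 symbols
  mapM_ (print . length . show . denominator . fst)
        [folds !! 0, folds !! 999, folds !! 1999, folds !! 2999, folds !! 3999]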
Jack Wines 2024-12-23 08:29:31 -05:00
parent 791fff6107
commit 46749895bc
Signed by: Jack
SSH key fingerprint: SHA256:AaP2Hr/e3mEjeY+s9XJmQqAesqEms8ENRhwRkpO0WUk
3 changed files with 106 additions and 40 deletions


@@ -53,7 +53,9 @@ executable compress
     optparse-generic,
     vector,
     nonempty-containers,
-    primes
+    witch,
+    monad-par,
+    monad-par-extras
   default-language:
     GHC2021
   other-modules:
@@ -63,8 +65,9 @@ executable compress
     Compress.PrefixTree
     Data.HuffmanTree
     Compress.Arithmetic
+    Data.WordyMap
   ghc-options:
     -threaded
     -fprof-auto
     -fprof-late
-    "-with-rtsopts=-p -hc"
+    "-with-rtsopts=-p -hc -B -N -qa"


@@ -12,12 +12,13 @@ import Data.List (genericLength)
 import Data.List qualified as L
 import Data.Map.Strict qualified as M
 import Data.Maybe qualified as My
-import Data.Numbers.Primes
 import Data.Ord
 import Data.Ratio
 import Data.Serialize qualified as C
 import GHC.Generics (Generic)
 import GHC.Natural (Natural)
+import qualified Control.Monad.Par as P
+import Control.Monad.Par (NFData)
 
 -- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
@@ -30,21 +31,32 @@ data WordMarkovStats = WordMarkovStats
   }
   deriving (Show, Eq, Ord, Generic, C.Serialize)
 
-toRing :: forall a k. (Ord k, Integral a, Bounded a) => M.Map k a -> M.Map k WordMarkovStats
-toRing m = M.fromList . zip (map fst asList) $ zipWith WordMarkovStats (scanl (+) 0 . map snd $ asList) (map snd asList)
+toRing :: forall a k. (Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
+toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
   where
-    asList = M.toList $ m'
-    sum' =
-      sum
-        . map fromIntegral
-        . M.elems
-        $ m
-    m' = M.map ((% sum') . fromIntegral) m
+    sizes = map ((% maxBound') . fromIntegral . snd) xs
+
+    wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
+
+    asFracsOfTotalSum = map fst . tail . L.scanl' f (0, 0) $ sizes
+
     maxBound' :: Integer
     maxBound' = fromIntegral (maxBound :: a)
 
-twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k (M.Map k b)
+    withBumpedZeroSized
+      | numZeroSized /= 0 = map (max (remainingSpace / numZeroSized)) asFracsOfTotalSum
+      | otherwise = asFracsOfTotalSum
+
+    remainingSpace = 1 - sum asFracsOfTotalSum
+
+    numZeroSized = L.genericLength . filter (== 0) $ asFracsOfTotalSum
+
+    f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
+      where
+        newFrac = currFrac * (1 - runningSum)
+
+twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k [(k, b)]
 twoByteMarkov xs =
   M.map sizeAsFraction
     . M.fromListWith (M.unionWith (+))
@@ -53,69 +65,115 @@ twoByteMarkov xs =
     . tail
     $ xs
   where
-    sizeAsFraction m = M.map (max 1 . floor . fromRational . (* maxBound') . (% sum')) m
+    -- optimization, changes fractions so they represent the fraction
+    -- of remaining space in the list taken up instead of total space
+    sizeAsFraction :: M.Map k Integer -> [(k, b)]
+    sizeAsFraction m =
+      zip keys
+        . map discretizeFraction
+        . fractionOfRemainingSums
+        $ counts
       where
-        sum' = sum . M.elems $ m
-    toInteger :: (Integral a) => a -> Integer
-    toInteger = fromIntegral
+        fractionOfRemainingSums xs = zipWith (%) xs . scanr (+) 0 $ xs
+        asList = L.sortOn (negate . snd) . M.assocs $ m
+
+        keys = map fst asList
+        counts = map snd asList
+
+        discretizeFraction :: Rational -> b
+        discretizeFraction = floor . fromRational . (* maxBound')
     maxBound' :: Ratio Integer
     maxBound' = fromIntegral (maxBound :: b)
 
 data Compressed a = Compressed
-  { markovs :: M.Map a (M.Map a (Word8)),
+  { markovs :: M.Map a [(a, Word8)],
+    chunks :: [Chunk a]
+  }
+  deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
+
+data Chunk a = Chunk {
     location :: Ratio Integer,
     start :: a,
     length :: Int
-  }
-  deriving (Eq, Ord, Show, Generic, C.Serialize)
+  } deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
 
 decompress ::
   forall a.
-  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) =>
+  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) =>
   Compressed a ->
   [a]
-decompress (Compressed {..}) = take length $ map snd . L.iterate decompress' $ (location, start)
+decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ chunks
   where
-    decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
-    decompress' (loc, prev) = ((loc - ansLoc) / newSize, newVal)
-      where
-        (ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)
     rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs
     toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
+    decompressChunk :: Chunk a -> [a]
+    decompressChunk (Chunk {..}) = map snd . take length . L.iterate' decompress' $ (location, start)
+      where
+        decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
+        decompress' (!loc, !prev) = ((loc - ansLoc) / newSize, newVal)
+          where
+            ansLoc :: Ratio Integer
+            newVal :: a
+            newSize :: Ratio Integer
+            (!ansLoc, (!newVal, !newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)
 
 sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
 sanityCheck = map (sum . M.elems) . M.elems
 
+chunk chunkSize = chunk'
+  where
+    chunk' [] = []
+    chunk' xs = xs' : chunk' xs''
+      where
+        (xs', xs'') = splitAt chunkSize xs
+
+chunkLength = 8000
+
 compress ::
   forall a.
-  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) =>
+  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) =>
   [a] ->
   Compressed a
-compress toCompress = Compressed twoByteMarkovs (shortestLocation endStats) (head toCompress) (genericLength toCompress)
+compress toCompress = Compressed twoByteMarkovs . P.runPar . P.parMap compressChunk $ unCompressedChunks
   where
+    unCompressedChunks = chunk chunkLength toCompress
    twoByteMarkovs = twoByteMarkov toCompress
     rings = M.map toRing twoByteMarkovs
-    pairs = zip toCompress . tail $ toCompress
-    shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size)
-    endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
-    addWordMarkovStats
-      (WordMarkovStats {location = prevLoc, size = prevSize})
-      (WordMarkovStats {location = nextLoc, size = nextSize}) =
-        WordMarkovStats
-          { location = prevLoc + (prevSize * nextLoc),
-            size = prevSize * nextSize
-          }
+    compressChunk toCompress = Chunk {
+        location = shortestLocation endStats,
+        start = head toCompress,
+        length = L.length toCompress
+      }
+      where
+        pairs = zip toCompress . tail $ toCompress
+        shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size)
+        endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
     statsFor (x0, x1) = (rings M.! x0) M.! x1
+    addWordMarkovStats
+      (WordMarkovStats {location = prevLoc, size = prevSize})
+      (WordMarkovStats {location = nextLoc, size = nextSize}) =
+        WordMarkovStats
+          { location = prevLoc + (prevSize * nextLoc),
+            size = prevSize * nextSize
+          }
 
 pyramidFold :: (a -> a -> a) -> [a] -> a
 pyramidFold f = pyramid
   where
     pyramid [x] = x
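
As an aside on the sizeAsFraction change above: expressing each count as a fraction of the space remaining after the earlier entries is invertible, and the local f in the new toRing is exactly that inverse (ignoring the Word8 quantisation and the zero-size bump, which make the real round trip approximate). A minimal standalone check of the identity on exact Rationals, with made-up counts:

import Data.List (scanl')
import Data.Ratio ((%))

-- forward direction, as in the new sizeAsFraction: each count over the
-- sum of itself and everything after it
toRemaining :: [Integer] -> [Rational]
toRemaining xs = zipWith (%) xs (scanr (+) 0 xs)

-- inverse direction, the same recurrence the new toRing's f uses:
-- scale each remaining-space fraction by the space still unclaimed
fromRemaining :: [Rational] -> [Rational]
fromRemaining = map fst . tail . scanl' step (0, 0)
  where
    step (_, used) r = (frac, used + frac)
      where
        frac = r * (1 - used)

main :: IO ()
main = do
  let counts = [7, 5, 3, 1] :: [Integer]
  print (fromRemaining (toRemaining counts))  -- [7 % 16,5 % 16,3 % 16,1 % 16]
  print (map (% sum counts) counts)           -- the same list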


@@ -52,7 +52,12 @@ main :: IO ()
 main = do
   -- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
   f <- BS.getContents
-  BS.putStr . C.encode . A.compress . (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
+  let fAsWords = (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
+  let compressedUnencoded :: A.Compressed Word8 = (A.compress fAsWords)
+  let compressed = C.encode compressedUnencoded
+  print ("compression ratio", (fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
+  print ("works?", (A.decompress compressedUnencoded) == fAsWords)
   -- let f = "hello tehre"
   -- f <- BS.readFile "pg64317.txt"
   -- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f