parallel compression and decompression

I've selected a chunk size of 8000 bytes rather arbitrarily, but it
seems to work just fine. The bigger problem is that the simplified
final fraction makes the running fraction in decompression's foldl
more complicated as time goes on, making it around 50 times slower
than compression. It takes just under an hour to decompress the
Great Gatsby.
This commit is contained in:
Jack Wines 2024-12-23 08:29:31 -05:00
parent 791fff6107
commit 46749895bc
Signed by: Jack
SSH key fingerprint: SHA256:AaP2Hr/e3mEjeY+s9XJmQqAesqEms8ENRhwRkpO0WUk
3 changed files with 106 additions and 40 deletions

View file

@ -53,7 +53,9 @@ executable compress
optparse-generic,
vector,
nonempty-containers,
primes
witch,
monad-par,
monad-par-extras
default-language:
GHC2021
other-modules:
@ -63,8 +65,9 @@ executable compress
Compress.PrefixTree
Data.HuffmanTree
Compress.Arithmetic
Data.WordyMap
ghc-options:
-threaded
-fprof-auto
-fprof-late
"-with-rtsopts=-p -hc"
"-with-rtsopts=-p -hc -B -N -qa"

View file

@ -12,12 +12,13 @@ import Data.List (genericLength)
import Data.List qualified as L
import Data.Map.Strict qualified as M
import Data.Maybe qualified as My
import Data.Numbers.Primes
import Data.Ord
import Data.Ratio
import Data.Serialize qualified as C
import GHC.Generics (Generic)
import GHC.Natural (Natural)
import qualified Control.Monad.Par as P
import Control.Monad.Par (NFData)
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
@ -30,21 +31,32 @@ data WordMarkovStats = WordMarkovStats
}
deriving (Show, Eq, Ord, Generic, C.Serialize)
toRing :: forall a k. (Ord k, Integral a, Bounded a) => M.Map k a -> M.Map k WordMarkovStats
toRing m = M.fromList . zip (map fst asList) $ zipWith WordMarkovStats (scanl (+) 0 . map snd $ asList) (map snd asList)
toRing :: forall a k. (Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
where
asList = M.toList $ m'
sum' =
sum
. map fromIntegral
. M.elems
$ m
m' = M.map ((% sum') . fromIntegral) m
sizes = map ((% maxBound') . fromIntegral . snd) xs
wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
asFracsOfTotalSum = map fst . tail . L.scanl' f (0, 0) $ sizes
maxBound' :: Integer
maxBound' = fromIntegral (maxBound :: a)
twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k (M.Map k b)
withBumpedZeroSized
| numZeroSized /= 0 = map (max (remainingSpace / numZeroSized)) asFracsOfTotalSum
| otherwise = asFracsOfTotalSum
remainingSpace = 1 - sum asFracsOfTotalSum
numZeroSized = L.genericLength . filter (== 0) $ asFracsOfTotalSum
f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
where
newFrac = currFrac * (1 - runningSum)
twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k [(k, b)]
twoByteMarkov xs =
M.map sizeAsFraction
. M.fromListWith (M.unionWith (+))
@ -53,59 +65,105 @@ twoByteMarkov xs =
. tail
$ xs
where
sizeAsFraction m = M.map (max 1 . floor . fromRational . (* maxBound') . (% sum')) m
-- optimization, changes fractions so they represent the fraction
-- of remaining space in the list taken up instead of total space
sizeAsFraction :: M.Map k Integer -> [(k, b)]
sizeAsFraction m =
zip keys
. map discretizeFraction
. fractionOfRemainingSums
$ counts
where
sum' = sum . M.elems $ m
fractionOfRemainingSums xs = zipWith (%) xs . scanr (+) 0 $ xs
toInteger :: (Integral a) => a -> Integer
toInteger = fromIntegral
asList = L.sortOn (negate . snd) . M.assocs $ m
keys = map fst asList
counts = map snd asList
discretizeFraction :: Rational -> b
discretizeFraction = floor . fromRational . (* maxBound')
maxBound' :: Ratio Integer
maxBound' = fromIntegral (maxBound :: b)
data Compressed a = Compressed
{ markovs :: M.Map a (M.Map a (Word8)),
{ markovs :: M.Map a [(a, Word8)],
chunks :: [Chunk a]
}
deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
data Chunk a = Chunk {
location :: Ratio Integer,
start :: a,
length :: Int
}
deriving (Eq, Ord, Show, Generic, C.Serialize)
} deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
decompress ::
forall a.
(Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) =>
(Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) =>
Compressed a ->
[a]
decompress (Compressed {..}) = take length $ map snd . L.iterate decompress' $ (location, start)
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ chunks
where
decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
decompress' (loc, prev) = ((loc - ansLoc) / newSize, newVal)
where
(ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
decompressChunk :: Chunk a -> [a]
decompressChunk (Chunk {..}) = map snd . take length . L.iterate' decompress' $ (location, start)
where
decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
decompress' (!loc, !prev) = ((loc - ansLoc) / newSize, newVal)
where
ansLoc :: Ratio Integer
newVal :: a
newSize :: Ratio Integer
(!ansLoc, (!newVal, !newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
sanityCheck = map (sum . M.elems) . M.elems
chunk chunkSize = chunk'
where
chunk' [] = []
chunk' xs = xs' : chunk' xs''
where
(xs', xs'') = splitAt chunkSize xs
chunkLength = 8000
compress ::
forall a.
(Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) =>
(Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) =>
[a] ->
Compressed a
compress toCompress = Compressed twoByteMarkovs (shortestLocation endStats) (head toCompress) (genericLength toCompress)
compress toCompress = Compressed twoByteMarkovs . P.runPar . P.parMap compressChunk $ unCompressedChunks
where
unCompressedChunks = chunk chunkLength toCompress
twoByteMarkovs = twoByteMarkov toCompress
rings = M.map toRing twoByteMarkovs
compressChunk toCompress = Chunk {
location = shortestLocation endStats,
start = head toCompress,
length = L.length toCompress
}
where
pairs = zip toCompress . tail $ toCompress
shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size)
endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
statsFor (x0, x1) = (rings M.! x0) M.! x1
addWordMarkovStats
(WordMarkovStats {location = prevLoc, size = prevSize})
(WordMarkovStats {location = nextLoc, size = nextSize}) =
@ -114,8 +172,8 @@ compress toCompress = Compressed twoByteMarkovs (shortestLocation endStats) (hea
size = prevSize * nextSize
}
statsFor (x0, x1) = (rings M.! x0) M.! x1
pyramidFold :: (a -> a -> a) -> [a] -> a
pyramidFold f = pyramid
where
pyramid [x] = x

View file

@ -52,7 +52,12 @@ main :: IO ()
main = do
-- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
f <- BS.getContents
BS.putStr . C.encode . A.compress . (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
let fAsWords = (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
let compressedUnencoded :: A.Compressed Word8 = (A.compress fAsWords)
let compressed = C.encode compressedUnencoded
print ("compression ratio", (fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
print ("works?", (A.decompress compressedUnencoded) == fAsWords)
-- let f = "hello tehre"
-- f <- BS.readFile "pg64317.txt"
-- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f