parallel compression and decompression
I've selected a chunk size of 8000 bytes rather arbitrarily, but it seems to work just fine. The bigger problem is that the simplified final fraction makes the running fraction in decompression's foldl more complicated as time goes on, making it around 50 times slower than compression. It takes just under an hour to decompress the Great Gatsby.
parent 791fff6107
commit 46749895bc
3 changed files with 106 additions and 40 deletions
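For context on the shape of the change below: the input is split into fixed-size chunks, and each chunk is compressed (and later decompressed) independently via monad-par's `parMap`/`runPar`. A minimal sketch of that pipeline, with the per-chunk work stubbed out as `sum` and the helper name `chunksOf` purely illustrative:

```haskell
import Control.Monad.Par (parMap, runPar)

-- Split the input into fixed-size pieces, standing in for the `chunk`
-- helper added in this commit (illustrative, not the real module).
chunksOf :: Int -> [a] -> [[a]]
chunksOf _ [] = []
chunksOf n xs = ys : chunksOf n zs
  where
    (ys, zs) = splitAt n xs

main :: IO ()
main =
  -- Each chunk is handed to its own spark; the real code runs the
  -- arithmetic coder per chunk instead of `sum`.
  print . runPar . parMap sum . chunksOf 8000 $ [1 .. 100000 :: Int]
```

The existing `-threaded` flag together with the `-N` RTS option added to the .cabal file below is what actually lets those sparks run across multiple cores.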
@@ -53,7 +53,9 @@ executable compress
     optparse-generic,
     vector,
     nonempty-containers,
-    primes
+    witch,
+    monad-par,
+    monad-par-extras
   default-language:
     GHC2021
   other-modules:

@@ -63,8 +65,9 @@ executable compress
     Compress.PrefixTree
     Data.HuffmanTree
     Compress.Arithmetic
+    Data.WordyMap
   ghc-options:
     -threaded
     -fprof-auto
     -fprof-late
-    "-with-rtsopts=-p -hc"
+    "-with-rtsopts=-p -hc -B -N -qa"
@@ -12,12 +12,13 @@ import Data.List (genericLength)
 import Data.List qualified as L
 import Data.Map.Strict qualified as M
 import Data.Maybe qualified as My
-import Data.Numbers.Primes
 import Data.Ord
 import Data.Ratio
 import Data.Serialize qualified as C
 import GHC.Generics (Generic)
 import GHC.Natural (Natural)
+import qualified Control.Monad.Par as P
+import Control.Monad.Par (NFData)

 -- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes

@@ -30,21 +31,32 @@ data WordMarkovStats = WordMarkovStats
   }
   deriving (Show, Eq, Ord, Generic, C.Serialize)

-toRing :: forall a k. (Ord k, Integral a, Bounded a) => M.Map k a -> M.Map k WordMarkovStats
-toRing m = M.fromList . zip (map fst asList) $ zipWith WordMarkovStats (scanl (+) 0 . map snd $ asList) (map snd asList)
+toRing :: forall a k. (Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
+toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
   where
-    asList = M.toList $ m'
-    sum' =
-      sum
-        . map fromIntegral
-        . M.elems
-        $ m
-    m' = M.map ((% sum') . fromIntegral) m
+    sizes = map ((% maxBound') . fromIntegral . snd) xs
+
+    wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
+
+    asFracsOfTotalSum = map fst . tail . L.scanl' f (0, 0) $ sizes

     maxBound' :: Integer
     maxBound' = fromIntegral (maxBound :: a)

-twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k (M.Map k b)
+    withBumpedZeroSized
+      | numZeroSized /= 0 = map (max (remainingSpace / numZeroSized)) asFracsOfTotalSum
+      | otherwise = asFracsOfTotalSum
+
+    remainingSpace = 1 - sum asFracsOfTotalSum
+
+    numZeroSized = L.genericLength . filter (== 0) $ asFracsOfTotalSum
+
+    f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
+      where
+        newFrac = currFrac * (1 - runningSum)
+
+twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k [(k, b)]
 twoByteMarkov xs =
   M.map sizeAsFraction
     . M.fromListWith (M.unionWith (+))
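The `toRing` change above and the `sizeAsFraction` change in the next hunk are two halves of the same representation change: the stored table now records each symbol's share of the *remaining* space rather than of the total, and `toRing` converts back with the `f` fold. A small self-contained round trip of the two transforms (function names here are illustrative; only the arithmetic matches the diff):

```haskell
import Data.List (scanl')
import Data.Ratio ((%))

-- Forward direction (as in sizeAsFraction): each count becomes the
-- fraction of the space remaining at its position, count / sum-of-rest.
fractionOfRemainingSums :: [Integer] -> [Rational]
fractionOfRemainingSums xs = zipWith (%) xs . scanr (+) 0 $ xs

-- Inverse direction (the `f` fold in toRing): recover each symbol's
-- fraction of the total from its fraction of the remaining space.
asFracsOfTotal :: [Rational] -> [Rational]
asFracsOfTotal = map fst . tail . scanl' step (0, 0)
  where
    step (_, runningSum) currFrac = (newFrac, newFrac + runningSum)
      where
        newFrac = currFrac * (1 - runningSum)

main :: IO ()
main = do
  let counts = [3, 2, 1]
  print (fractionOfRemainingSums counts)                  -- [1 % 2, 2 % 3, 1 % 1]
  print (asFracsOfTotal (fractionOfRemainingSums counts)) -- [1 % 2, 1 % 3, 1 % 6]
```

In the real module the counts are first sorted in descending order and discretized to `Word8`, and `toRing` bumps zero-sized entries (`withBumpedZeroSized`) so every symbol keeps a nonzero interval.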
@@ -53,69 +65,115 @@ twoByteMarkov xs =
     . tail
     $ xs
   where
-    sizeAsFraction m = M.map (max 1 . floor . fromRational . (* maxBound') . (% sum')) m
+    -- optimization, changes fractions so they represent the fraction
+    -- of remaining space in the list taken up instead of total space
+    sizeAsFraction :: M.Map k Integer -> [(k, b)]
+    sizeAsFraction m =
+      zip keys
+        . map discretizeFraction
+        . fractionOfRemainingSums
+        $ counts
       where
-        sum' = sum . M.elems $ m
-        toInteger :: (Integral a) => a -> Integer
-        toInteger = fromIntegral
+        fractionOfRemainingSums xs = zipWith (%) xs . scanr (+) 0 $ xs
+
+        asList = L.sortOn (negate . snd) . M.assocs $ m
+
+        keys = map fst asList
+
+        counts = map snd asList
+
+        discretizeFraction :: Rational -> b
+        discretizeFraction = floor . fromRational . (* maxBound')
+
+        maxBound' :: Ratio Integer
         maxBound' = fromIntegral (maxBound :: b)

 data Compressed a = Compressed
-  { markovs :: M.Map a (M.Map a (Word8)),
+  { markovs :: M.Map a [(a, Word8)],
+    chunks :: [Chunk a]
+  }
+  deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
+
+data Chunk a = Chunk {
     location :: Ratio Integer,
     start :: a,
     length :: Int
-  }
-  deriving (Eq, Ord, Show, Generic, C.Serialize)
+  } deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)

 decompress ::
   forall a.
-  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) =>
+  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) =>
   Compressed a ->
   [a]
-decompress (Compressed {..}) = take length $ map snd . L.iterate decompress' $ (location, start)
+decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ chunks
   where
-    decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
-    decompress' (loc, prev) = ((loc - ansLoc) / newSize, newVal)
-      where
-        (ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)
-
     rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs

     toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))

+    decompressChunk :: Chunk a -> [a]
+    decompressChunk (Chunk {..}) = map snd . take length . L.iterate' decompress' $ (location, start)
+      where
+        decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
+        decompress' (!loc, !prev) = ((loc - ansLoc) / newSize, newVal)
+          where
+            ansLoc :: Ratio Integer
+            newVal :: a
+            newSize :: Ratio Integer
+            (!ansLoc, (!newVal, !newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)
+
 sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
 sanityCheck = map (sum . M.elems) . M.elems

+chunk chunkSize = chunk'
+  where
+    chunk' [] = []
+    chunk' xs = xs' : chunk' xs''
+      where
+        (xs', xs'') = splitAt chunkSize xs
+
+chunkLength = 8000
+
 compress ::
   forall a.
-  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) =>
+  (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) =>
   [a] ->
   Compressed a
-compress toCompress = Compressed twoByteMarkovs (shortestLocation endStats) (head toCompress) (genericLength toCompress)
+compress toCompress = Compressed twoByteMarkovs . P.runPar . P.parMap compressChunk $ unCompressedChunks
   where
+    unCompressedChunks = chunk chunkLength toCompress
+
     twoByteMarkovs = twoByteMarkov toCompress

     rings = M.map toRing twoByteMarkovs

-    pairs = zip toCompress . tail $ toCompress
-
-    shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size)
-
-    endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
-
-    addWordMarkovStats
-      (WordMarkovStats {location = prevLoc, size = prevSize})
-      (WordMarkovStats {location = nextLoc, size = nextSize}) =
-        WordMarkovStats
-          { location = prevLoc + (prevSize * nextLoc),
-            size = prevSize * nextSize
-          }
+    compressChunk toCompress = Chunk {
+        location = shortestLocation endStats,
+        start = head toCompress,
+        length = L.length toCompress
+      }
+      where
+        pairs = zip toCompress . tail $ toCompress
+
+        shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size)
+
+        endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs

     statsFor (x0, x1) = (rings M.! x0) M.! x1

+    addWordMarkovStats
+      (WordMarkovStats {location = prevLoc, size = prevSize})
+      (WordMarkovStats {location = nextLoc, size = nextSize}) =
+        WordMarkovStats
+          { location = prevLoc + (prevSize * nextLoc),
+            size = prevSize * nextSize
+          }
+
+pyramidFold :: (a -> a -> a) -> [a] -> a
 pyramidFold f = pyramid
   where
     pyramid [x] = x
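For reference, a toy, self-contained version of the decode step that `decompressChunk` iterates, using a single hard-coded ring instead of one ring per preceding symbol (the names `ring`, `decodeStep`, and `decode` are illustrative): `M.lookupLE` picks the interval containing the current location, and the location is then rescaled into that interval. As the commit message notes, the running location fraction gets more complicated as decoding proceeds, so each step gets slower the deeper it sits in a chunk; the fixed 8000-byte chunks bound how bad that gets.

```haskell
import qualified Data.Map.Strict as M
import Data.Maybe (fromJust)
import Data.Ratio ((%))

-- One decode ring: interval start -> (symbol, interval size).
-- 'a' owns [0, 1/2), 'b' owns [1/2, 5/6), 'c' owns [5/6, 1).
ring :: M.Map Rational (Char, Rational)
ring = M.fromList [(0, ('a', 1 % 2)), (1 % 2, ('b', 1 % 3)), (5 % 6, ('c', 1 % 6))]

-- Mirrors decompress': find the interval containing the location, emit its
-- symbol, and rescale the location into that interval for the next step.
decodeStep :: Rational -> (Rational, Char)
decodeStep loc = ((loc - start) / size, sym)
  where
    (start, (sym, size)) = fromJust (M.lookupLE loc ring)

decode :: Int -> Rational -> String
decode 0 _ = []
decode n loc = sym : decode (n - 1) loc'
  where
    (loc', sym) = decodeStep loc

main :: IO ()
main = putStrLn (decode 4 (7 % 12)) -- prints "baba"
```

The real `decompressChunk` follows the same shape, but with bang patterns and a ring per preceding symbol (`rings M.! prev`) built by `toRing`.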
@@ -52,7 +52,12 @@ main :: IO ()
 main = do
   -- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
   f <- BS.getContents
-  BS.putStr . C.encode . A.compress . (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
+  let fAsWords = (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
+  let compressedUnencoded :: A.Compressed Word8 = (A.compress fAsWords)
+  let compressed = C.encode compressedUnencoded
+  print ("compression ratio", (fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
+  print ("works?", (A.decompress compressedUnencoded) == fAsWords)
+
   -- let f = "hello tehre"
   -- f <- BS.readFile "pg64317.txt"
   -- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f