decompression works

I also added a cli interface
This commit is contained in:
Jack Wines 2024-04-15 19:18:24 -07:00
parent 5bb8afb672
commit 43afd380c4
No known key found for this signature in database
GPG key ID: 25B20640600571E6
3 changed files with 69 additions and 27 deletions

View file

@ -1,13 +1,19 @@
Compresses files using [Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding). Please note that decompression isn't fully implemented yet. It is however implemented to the point where we know compression works correctly. Compresses files using [Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding).
On [The Great Gatsby in plain text](https://www.gutenberg.org/cache/epub/64317/pg64317.txt) on project Gutenberg, it achieves a 0.526 compression ratio when using a word size of two bytes. On an arbitrary RAW image, it peaks at .622 with a 32 bit word size and .645 with a 16 bit one. On [The Great Gatsby in plain text](https://www.gutenberg.org/cache/epub/64317/pg64317.txt) on project Gutenberg, it achieves a 0.526 compression ratio when using a word size of two bytes. On an arbitrary RAW image, it peaks at .622 with a 32 bit word size and .645 with a 16 bit one.
To run with [Nix](nixos.org): To run with [Nix](nixos.org):
``` sh ``` sh
nix run ".#" --experimental-features "nix-command flakes" -- myFile nix run ".#" --experimental-features "nix-command flakes" -- compress < myFile > myFile.compressed
nix run ".#" --experimental-features "nix-command flakes" -- decompress < myFile.compressed > myFile-decompressed
sha256sum myFile myFile-decompressed
wc -c myFile.compressed myFile
``` ```
To run with [cabal](https://www.haskell.org/ghcup/): To run with [cabal](https://www.haskell.org/ghcup/):
``` sh ``` sh
cabal run . -- myFile cabal run . -- compress < myFile > myFile.compressed
cabal run . -- decompress < myFile > myFile-decompressed
sha256sum myFile myFile-decompressed
wc -c myFile.compressed myFile
``` ```

View file

@ -30,6 +30,7 @@ executable compress
StandaloneDeriving, StandaloneDeriving,
StrictData, StrictData,
TemplateHaskell, TemplateHaskell,
LambdaCase,
TupleSections, TupleSections,
TypeApplications, TypeApplications,
TypeFamilies, TypeFamilies,
@ -46,12 +47,13 @@ executable compress
containers, containers,
text, text,
uuid, uuid,
optparse-generic,
vector vector
default-language: default-language:
GHC2021 GHC2021
other-modules: other-modules:
Data.PQueue Data.PQueue
ghc-options: -- ghc-options:
-fprof-auto -- -fprof-auto
-fprof-late -- -fprof-late
"-with-rtsopts=-p -hc" -- "-with-rtsopts=-p -hc"

View file

@ -1,6 +1,7 @@
module Main where module Main where
import Data.Bifunctor qualified as Bi import Data.Bifunctor qualified as Bi
import Data.Bit (cloneToByteString)
import Data.Bit qualified as B import Data.Bit qualified as B
import Data.Bit qualified as BV import Data.Bit qualified as BV
import Data.Bits (Bits ((.|.))) import Data.Bits (Bits ((.|.)))
@ -19,9 +20,15 @@ import Data.Text qualified as T
import Data.Text.IO qualified as TIO import Data.Text.IO qualified as TIO
import Data.Vector.Unboxed qualified as V import Data.Vector.Unboxed qualified as V
import Data.Word import Data.Word
import Debug.Trace qualified as D
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Options.Generic qualified as O
import System.Environment qualified as SE import System.Environment qualified as SE
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic)
instance O.ParseRecord CompressOrDecompress
data HuffmanTree a data HuffmanTree a
= Leaf a = Leaf a
| Node | Node
@ -37,9 +44,9 @@ findTreeDirections (Node {..}) = M.union (rec' L left) (rec' R right)
rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir] rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir]
rec' dir = M.map (dir :) . findTreeDirections rec' dir = M.map (dir :) . findTreeDirections
decompress :: forall a. (Ord a, Integral a, B.FiniteBits a) => Maybe ([TreeDir], HuffmanTree a) -> Maybe [a] decompress :: forall a. (Ord a, Integral a, B.FiniteBits a) => Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString
decompress Nothing = Just [] decompress Nothing = Just []
decompress (Just (treeDirs, tree)) = decompress' treeDirs decompress (Just (treeDirs, tree)) = BS.concat . map toByteString <$> decompress' treeDirs
where where
decompress' :: [TreeDir] -> Maybe [a] decompress' :: [TreeDir] -> Maybe [a]
decompress' [] = Just [] decompress' [] = Just []
@ -71,13 +78,31 @@ compress bs =
dividedByteString = toBitsList bs dividedByteString = toBitsList bs
testCompression :: forall a. (Ord a, Eq a, Integral a, B.FiniteBits a) => D.Proxy a -> BS.ByteString -> Bool testCompression :: forall a. (Ord a, Eq a, Integral a, B.FiniteBits a, C.Serialize a) => D.Proxy a -> BS.ByteString -> Bool
testCompression _ bs = ((Just . toBitsList $ bs) :: Maybe [a]) == (decompress . compress $ bs) testCompression _ bs =
((Right . Just $ bs) ==)
. Bi.second (decompress :: Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString)
-- . D.traceShowWith (Bi.second (fmap fst))
. (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a)))
. encodeCompressed
-- . D.traceShowWith (fmap fst)
. (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a))
$ bs
encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> (BS.ByteString)
encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs) encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs)
where where
encodeTreeDirs = BV.cloneToByteString . V.fromList . map (BV.Bit . (== R)) encodeTreeDirs = cloneToByteStringWithLen . V.fromList . map (BV.Bit . (== R))
cloneToByteStringWithLen vec = (BV.cloneToByteString vec, V.length vec)
cloneFromByteStringWithLen (bs, len) = V.take len . BV.cloneFromByteString $ bs
decodeTreeDirs :: (BS.ByteString, Int) -> [TreeDir]
decodeTreeDirs = map (\x -> if BV.unBit x then R else L) . V.toList . cloneFromByteStringWithLen
decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBits a, C.Serialize a) => BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a))
decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode
mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a) mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a)
mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of
@ -101,23 +126,28 @@ toBitsList bs = case finiteBitUncons bs of
Nothing -> [] Nothing -> []
(Just (x, xs)) -> x : (toBitsList xs) (Just (x, xs)) -> x : (toBitsList xs)
toByteString :: forall a. (Integral a, B.FiniteBits a) => a -> BS.ByteString
toByteString n = BS.pack . take numBytes . map (fromIntegral . (n `B.rotateL`)) $ [8, 16 ..]
where
numBytes = numBytesIn (D.Proxy :: D.Proxy a)
finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString) finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString)
finiteBitUncons [] = Nothing finiteBitUncons [] = Nothing
finiteBitUncons bs = finiteBitUncons bs =
Just Just
. (,rest) . (,rest)
. F.foldl' (.|.) 0 . F.foldl' (.|.) 0
. zipWith (flip B.shiftL) [0, 8 ..] . zipWith (flip B.rotateR) [8, 16 ..]
. reverse
. map (fromIntegral :: Word8 -> a) . map (fromIntegral :: Word8 -> a)
. BS.unpack . BS.unpack
$ takenBytes $ takenBytes
where where
numBytes = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a)
takenBytes :: BS.ByteString takenBytes :: BS.ByteString
rest :: BS.ByteString rest :: BS.ByteString
(takenBytes, rest) = BS.splitAt numBytes bs (takenBytes, rest) = BS.splitAt (numBytesIn (D.Proxy :: D.Proxy a)) bs
numBytesIn :: forall a. (B.FiniteBits a) => D.Proxy a -> Int
numBytesIn _ = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a)
compressionRatioFor compressionRatioFor
:: forall a :: forall a
@ -133,14 +163,18 @@ compressionRatioFor proxy bs =
. (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a)) . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a))
$ bs $ bs
applyCompressionOptions :: CompressOrDecompress -> BS.ByteString -> BS.ByteString
applyCompressionOptions Compress f = encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word8)) $ f
applyCompressionOptions Decompress f = case Bi.second decompress
. (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree Word8)))
$ f of
-- TODO: write errors to stderr
(Left _) -> []
(Right Nothing) -> []
(Right (Just bs)) -> bs
main :: IO () main :: IO ()
main = do main = do
[filePath] <- SE.getArgs compresionOrDecompression :: CompressOrDecompress <- O.getRecord "compression/decompression"
f <- BS.readFile filePath f <- BS.getContents
TIO.putStrLn "does decompress . compress == id ?" BS.putStr . applyCompressionOptions compresionOrDecompression $ f
print . ("Word64",) . testCompression (D.Proxy :: D.Proxy Word16) $ f
TIO.putStrLn "compression ratios:"
print . ("Word64",) . compressionRatioFor (D.Proxy :: D.Proxy Word64) $ f
print . ("Word32",) . compressionRatioFor (D.Proxy :: D.Proxy Word32) $ f
print . ("Word16",) . compressionRatioFor (D.Proxy :: D.Proxy Word16) $ f
print . ("Word8",) . compressionRatioFor (D.Proxy :: D.Proxy Word8) $ f