decompression works

I also added a cli interface
This commit is contained in:
Jack Wines 2024-04-15 19:18:24 -07:00
parent 5bb8afb672
commit 43afd380c4
No known key found for this signature in database
GPG key ID: 25B20640600571E6
3 changed files with 69 additions and 27 deletions

View file

@ -1,13 +1,19 @@
Compresses files using [Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding). Please note that decompression isn't fully implemented yet. It is however implemented to the point where we know compression works correctly.
Compresses files using [Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding).
On [The Great Gatsby in plain text](https://www.gutenberg.org/cache/epub/64317/pg64317.txt) on project Gutenberg, it achieves a 0.526 compression ratio when using a word size of two bytes. On an arbitrary RAW image, it peaks at .622 with a 32 bit word size and .645 with a 16 bit one.
To run with [Nix](nixos.org):
``` sh
nix run ".#" --experimental-features "nix-command flakes" -- myFile
nix run ".#" --experimental-features "nix-command flakes" -- compress < myFile > myFile.compressed
nix run ".#" --experimental-features "nix-command flakes" -- decompress < myFile.compressed > myFile-decompressed
sha256sum myFile myFile-decompressed
wc -c myFile.compressed myFile
```
To run with [cabal](https://www.haskell.org/ghcup/):
``` sh
cabal run . -- myFile
cabal run . -- compress < myFile > myFile.compressed
cabal run . -- decompress < myFile > myFile-decompressed
sha256sum myFile myFile-decompressed
wc -c myFile.compressed myFile
```

View file

@ -30,6 +30,7 @@ executable compress
StandaloneDeriving,
StrictData,
TemplateHaskell,
LambdaCase,
TupleSections,
TypeApplications,
TypeFamilies,
@ -46,12 +47,13 @@ executable compress
containers,
text,
uuid,
optparse-generic,
vector
default-language:
GHC2021
other-modules:
Data.PQueue
ghc-options:
-fprof-auto
-fprof-late
"-with-rtsopts=-p -hc"
-- ghc-options:
-- -fprof-auto
-- -fprof-late
-- "-with-rtsopts=-p -hc"

View file

@ -1,6 +1,7 @@
module Main where
import Data.Bifunctor qualified as Bi
import Data.Bit (cloneToByteString)
import Data.Bit qualified as B
import Data.Bit qualified as BV
import Data.Bits (Bits ((.|.)))
@ -19,9 +20,15 @@ import Data.Text qualified as T
import Data.Text.IO qualified as TIO
import Data.Vector.Unboxed qualified as V
import Data.Word
import Debug.Trace qualified as D
import GHC.Generics (Generic)
import Options.Generic qualified as O
import System.Environment qualified as SE
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic)
instance O.ParseRecord CompressOrDecompress
data HuffmanTree a
= Leaf a
| Node
@ -37,9 +44,9 @@ findTreeDirections (Node {..}) = M.union (rec' L left) (rec' R right)
rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir]
rec' dir = M.map (dir :) . findTreeDirections
decompress :: forall a. (Ord a, Integral a, B.FiniteBits a) => Maybe ([TreeDir], HuffmanTree a) -> Maybe [a]
decompress :: forall a. (Ord a, Integral a, B.FiniteBits a) => Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString
decompress Nothing = Just []
decompress (Just (treeDirs, tree)) = decompress' treeDirs
decompress (Just (treeDirs, tree)) = BS.concat . map toByteString <$> decompress' treeDirs
where
decompress' :: [TreeDir] -> Maybe [a]
decompress' [] = Just []
@ -71,13 +78,31 @@ compress bs =
dividedByteString = toBitsList bs
testCompression :: forall a. (Ord a, Eq a, Integral a, B.FiniteBits a) => D.Proxy a -> BS.ByteString -> Bool
testCompression _ bs = ((Just . toBitsList $ bs) :: Maybe [a]) == (decompress . compress $ bs)
testCompression :: forall a. (Ord a, Eq a, Integral a, B.FiniteBits a, C.Serialize a) => D.Proxy a -> BS.ByteString -> Bool
testCompression _ bs =
((Right . Just $ bs) ==)
. Bi.second (decompress :: Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString)
-- . D.traceShowWith (Bi.second (fmap fst))
. (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a)))
. encodeCompressed
-- . D.traceShowWith (fmap fst)
. (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a))
$ bs
encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString
encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> (BS.ByteString)
encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs)
where
encodeTreeDirs = BV.cloneToByteString . V.fromList . map (BV.Bit . (== R))
encodeTreeDirs = cloneToByteStringWithLen . V.fromList . map (BV.Bit . (== R))
cloneToByteStringWithLen vec = (BV.cloneToByteString vec, V.length vec)
cloneFromByteStringWithLen (bs, len) = V.take len . BV.cloneFromByteString $ bs
decodeTreeDirs :: (BS.ByteString, Int) -> [TreeDir]
decodeTreeDirs = map (\x -> if BV.unBit x then R else L) . V.toList . cloneFromByteStringWithLen
decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBits a, C.Serialize a) => BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a))
decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode
mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a)
mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of
@ -101,23 +126,28 @@ toBitsList bs = case finiteBitUncons bs of
Nothing -> []
(Just (x, xs)) -> x : (toBitsList xs)
toByteString :: forall a. (Integral a, B.FiniteBits a) => a -> BS.ByteString
toByteString n = BS.pack . take numBytes . map (fromIntegral . (n `B.rotateL`)) $ [8, 16 ..]
where
numBytes = numBytesIn (D.Proxy :: D.Proxy a)
finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString)
finiteBitUncons [] = Nothing
finiteBitUncons bs =
Just
. (,rest)
. F.foldl' (.|.) 0
. zipWith (flip B.shiftL) [0, 8 ..]
. reverse
. zipWith (flip B.rotateR) [8, 16 ..]
. map (fromIntegral :: Word8 -> a)
. BS.unpack
$ takenBytes
where
numBytes = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a)
takenBytes :: BS.ByteString
rest :: BS.ByteString
(takenBytes, rest) = BS.splitAt numBytes bs
(takenBytes, rest) = BS.splitAt (numBytesIn (D.Proxy :: D.Proxy a)) bs
numBytesIn :: forall a. (B.FiniteBits a) => D.Proxy a -> Int
numBytesIn _ = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a)
compressionRatioFor
:: forall a
@ -133,14 +163,18 @@ compressionRatioFor proxy bs =
. (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a))
$ bs
applyCompressionOptions :: CompressOrDecompress -> BS.ByteString -> BS.ByteString
applyCompressionOptions Compress f = encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word8)) $ f
applyCompressionOptions Decompress f = case Bi.second decompress
. (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree Word8)))
$ f of
-- TODO: write errors to stderr
(Left _) -> []
(Right Nothing) -> []
(Right (Just bs)) -> bs
main :: IO ()
main = do
[filePath] <- SE.getArgs
f <- BS.readFile filePath
TIO.putStrLn "does decompress . compress == id ?"
print . ("Word64",) . testCompression (D.Proxy :: D.Proxy Word16) $ f
TIO.putStrLn "compression ratios:"
print . ("Word64",) . compressionRatioFor (D.Proxy :: D.Proxy Word64) $ f
print . ("Word32",) . compressionRatioFor (D.Proxy :: D.Proxy Word32) $ f
print . ("Word16",) . compressionRatioFor (D.Proxy :: D.Proxy Word16) $ f
print . ("Word8",) . compressionRatioFor (D.Proxy :: D.Proxy Word8) $ f
compresionOrDecompression :: CompressOrDecompress <- O.getRecord "compression/decompression"
f <- BS.getContents
BS.putStr . applyCompressionOptions compresionOrDecompression $ f