diff --git a/README.md b/README.md index b2d1930..d9ec442 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,19 @@ -Compresses files using [Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding). Please note that decompression isn't fully implemented yet. It is however implemented to the point where we know compression works correctly. +Compresses files using [Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding). On [The Great Gatsby in plain text](https://www.gutenberg.org/cache/epub/64317/pg64317.txt) on project Gutenberg, it achieves a 0.526 compression ratio when using a word size of two bytes. On an arbitrary RAW image, it peaks at .622 with a 32 bit word size and .645 with a 16 bit one. To run with [Nix](nixos.org): ``` sh -nix run ".#" --experimental-features "nix-command flakes" -- myFile +nix run ".#" --experimental-features "nix-command flakes" -- compress < myFile > myFile.compressed +nix run ".#" --experimental-features "nix-command flakes" -- decompress < myFile.compressed > myFile-decompressed +sha256sum myFile myFile-decompressed +wc -c myFile.compressed myFile ``` To run with [cabal](https://www.haskell.org/ghcup/): ``` sh -cabal run . -- myFile +cabal run . -- compress < myFile > myFile.compressed +cabal run . -- decompress < myFile > myFile-decompressed +sha256sum myFile myFile-decompressed +wc -c myFile.compressed myFile ``` diff --git a/compress.cabal b/compress.cabal index fdd926f..037374c 100644 --- a/compress.cabal +++ b/compress.cabal @@ -30,6 +30,7 @@ executable compress StandaloneDeriving, StrictData, TemplateHaskell, + LambdaCase, TupleSections, TypeApplications, TypeFamilies, @@ -46,12 +47,13 @@ executable compress containers, text, uuid, + optparse-generic, vector default-language: GHC2021 other-modules: Data.PQueue - ghc-options: - -fprof-auto - -fprof-late - "-with-rtsopts=-p -hc" + -- ghc-options: + -- -fprof-auto + -- -fprof-late + -- "-with-rtsopts=-p -hc" diff --git a/src/Main.hs b/src/Main.hs index e0f64f5..c161d09 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -1,6 +1,7 @@ module Main where import Data.Bifunctor qualified as Bi +import Data.Bit (cloneToByteString) import Data.Bit qualified as B import Data.Bit qualified as BV import Data.Bits (Bits ((.|.))) @@ -19,9 +20,15 @@ import Data.Text qualified as T import Data.Text.IO qualified as TIO import Data.Vector.Unboxed qualified as V import Data.Word +import Debug.Trace qualified as D import GHC.Generics (Generic) +import Options.Generic qualified as O import System.Environment qualified as SE +data CompressOrDecompress = Compress | Decompress deriving (Show, Generic) + +instance O.ParseRecord CompressOrDecompress + data HuffmanTree a = Leaf a | Node @@ -37,9 +44,9 @@ findTreeDirections (Node {..}) = M.union (rec' L left) (rec' R right) rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir] rec' dir = M.map (dir :) . findTreeDirections -decompress :: forall a. (Ord a, Integral a, B.FiniteBits a) => Maybe ([TreeDir], HuffmanTree a) -> Maybe [a] +decompress :: forall a. (Ord a, Integral a, B.FiniteBits a) => Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString decompress Nothing = Just [] -decompress (Just (treeDirs, tree)) = decompress' treeDirs +decompress (Just (treeDirs, tree)) = BS.concat . map toByteString <$> decompress' treeDirs where decompress' :: [TreeDir] -> Maybe [a] decompress' [] = Just [] @@ -71,13 +78,31 @@ compress bs = dividedByteString = toBitsList bs -testCompression :: forall a. (Ord a, Eq a, Integral a, B.FiniteBits a) => D.Proxy a -> BS.ByteString -> Bool -testCompression _ bs = ((Just . toBitsList $ bs) :: Maybe [a]) == (decompress . compress $ bs) +testCompression :: forall a. (Ord a, Eq a, Integral a, B.FiniteBits a, C.Serialize a) => D.Proxy a -> BS.ByteString -> Bool +testCompression _ bs = + ((Right . Just $ bs) ==) + . Bi.second (decompress :: Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString) + -- . D.traceShowWith (Bi.second (fmap fst)) + . (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a))) + . encodeCompressed + -- . D.traceShowWith (fmap fst) + . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a)) + $ bs -encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString +encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> (BS.ByteString) encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs) where - encodeTreeDirs = BV.cloneToByteString . V.fromList . map (BV.Bit . (== R)) + encodeTreeDirs = cloneToByteStringWithLen . V.fromList . map (BV.Bit . (== R)) + +cloneToByteStringWithLen vec = (BV.cloneToByteString vec, V.length vec) + +cloneFromByteStringWithLen (bs, len) = V.take len . BV.cloneFromByteString $ bs + +decodeTreeDirs :: (BS.ByteString, Int) -> [TreeDir] +decodeTreeDirs = map (\x -> if BV.unBit x then R else L) . V.toList . cloneFromByteStringWithLen + +decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBits a, C.Serialize a) => BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a)) +decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a) mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of @@ -101,23 +126,28 @@ toBitsList bs = case finiteBitUncons bs of Nothing -> [] (Just (x, xs)) -> x : (toBitsList xs) +toByteString :: forall a. (Integral a, B.FiniteBits a) => a -> BS.ByteString +toByteString n = BS.pack . take numBytes . map (fromIntegral . (n `B.rotateL`)) $ [8, 16 ..] + where + numBytes = numBytesIn (D.Proxy :: D.Proxy a) + finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString) finiteBitUncons [] = Nothing finiteBitUncons bs = Just . (,rest) . F.foldl' (.|.) 0 - . zipWith (flip B.shiftL) [0, 8 ..] - . reverse + . zipWith (flip B.rotateR) [8, 16 ..] . map (fromIntegral :: Word8 -> a) . BS.unpack $ takenBytes where - numBytes = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a) - takenBytes :: BS.ByteString rest :: BS.ByteString - (takenBytes, rest) = BS.splitAt numBytes bs + (takenBytes, rest) = BS.splitAt (numBytesIn (D.Proxy :: D.Proxy a)) bs + +numBytesIn :: forall a. (B.FiniteBits a) => D.Proxy a -> Int +numBytesIn _ = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a) compressionRatioFor :: forall a @@ -133,14 +163,18 @@ compressionRatioFor proxy bs = . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a)) $ bs +applyCompressionOptions :: CompressOrDecompress -> BS.ByteString -> BS.ByteString +applyCompressionOptions Compress f = encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word8)) $ f +applyCompressionOptions Decompress f = case Bi.second decompress + . (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree Word8))) + $ f of + -- TODO: write errors to stderr + (Left _) -> [] + (Right Nothing) -> [] + (Right (Just bs)) -> bs + main :: IO () main = do - [filePath] <- SE.getArgs - f <- BS.readFile filePath - TIO.putStrLn "does decompress . compress == id ?" - print . ("Word64",) . testCompression (D.Proxy :: D.Proxy Word16) $ f - TIO.putStrLn "compression ratios:" - print . ("Word64",) . compressionRatioFor (D.Proxy :: D.Proxy Word64) $ f - print . ("Word32",) . compressionRatioFor (D.Proxy :: D.Proxy Word32) $ f - print . ("Word16",) . compressionRatioFor (D.Proxy :: D.Proxy Word16) $ f - print . ("Word8",) . compressionRatioFor (D.Proxy :: D.Proxy Word8) $ f + compresionOrDecompression :: CompressOrDecompress <- O.getRecord "compression/decompression" + f <- BS.getContents + BS.putStr . applyCompressionOptions compresionOrDecompression $ f