decompression works
I also added a cli interface
This commit is contained in:
parent
5bb8afb672
commit
43afd380c4
3 changed files with 69 additions and 27 deletions
12
README.md
12
README.md
|
|
@ -1,13 +1,19 @@
|
||||||
Compresses files using [Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding). Please note that decompression isn't fully implemented yet. It is however implemented to the point where we know compression works correctly.
|
Compresses files using [Huffman coding](https://en.wikipedia.org/wiki/Huffman_coding).
|
||||||
|
|
||||||
On [The Great Gatsby in plain text](https://www.gutenberg.org/cache/epub/64317/pg64317.txt) on project Gutenberg, it achieves a 0.526 compression ratio when using a word size of two bytes. On an arbitrary RAW image, it peaks at .622 with a 32 bit word size and .645 with a 16 bit one.
|
On [The Great Gatsby in plain text](https://www.gutenberg.org/cache/epub/64317/pg64317.txt) on project Gutenberg, it achieves a 0.526 compression ratio when using a word size of two bytes. On an arbitrary RAW image, it peaks at .622 with a 32 bit word size and .645 with a 16 bit one.
|
||||||
|
|
||||||
To run with [Nix](nixos.org):
|
To run with [Nix](nixos.org):
|
||||||
``` sh
|
``` sh
|
||||||
nix run ".#" --experimental-features "nix-command flakes" -- myFile
|
nix run ".#" --experimental-features "nix-command flakes" -- compress < myFile > myFile.compressed
|
||||||
|
nix run ".#" --experimental-features "nix-command flakes" -- decompress < myFile.compressed > myFile-decompressed
|
||||||
|
sha256sum myFile myFile-decompressed
|
||||||
|
wc -c myFile.compressed myFile
|
||||||
```
|
```
|
||||||
|
|
||||||
To run with [cabal](https://www.haskell.org/ghcup/):
|
To run with [cabal](https://www.haskell.org/ghcup/):
|
||||||
``` sh
|
``` sh
|
||||||
cabal run . -- myFile
|
cabal run . -- compress < myFile > myFile.compressed
|
||||||
|
cabal run . -- decompress < myFile > myFile-decompressed
|
||||||
|
sha256sum myFile myFile-decompressed
|
||||||
|
wc -c myFile.compressed myFile
|
||||||
```
|
```
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,7 @@ executable compress
|
||||||
StandaloneDeriving,
|
StandaloneDeriving,
|
||||||
StrictData,
|
StrictData,
|
||||||
TemplateHaskell,
|
TemplateHaskell,
|
||||||
|
LambdaCase,
|
||||||
TupleSections,
|
TupleSections,
|
||||||
TypeApplications,
|
TypeApplications,
|
||||||
TypeFamilies,
|
TypeFamilies,
|
||||||
|
|
@ -46,12 +47,13 @@ executable compress
|
||||||
containers,
|
containers,
|
||||||
text,
|
text,
|
||||||
uuid,
|
uuid,
|
||||||
|
optparse-generic,
|
||||||
vector
|
vector
|
||||||
default-language:
|
default-language:
|
||||||
GHC2021
|
GHC2021
|
||||||
other-modules:
|
other-modules:
|
||||||
Data.PQueue
|
Data.PQueue
|
||||||
ghc-options:
|
-- ghc-options:
|
||||||
-fprof-auto
|
-- -fprof-auto
|
||||||
-fprof-late
|
-- -fprof-late
|
||||||
"-with-rtsopts=-p -hc"
|
-- "-with-rtsopts=-p -hc"
|
||||||
|
|
|
||||||
74
src/Main.hs
74
src/Main.hs
|
|
@ -1,6 +1,7 @@
|
||||||
module Main where
|
module Main where
|
||||||
|
|
||||||
import Data.Bifunctor qualified as Bi
|
import Data.Bifunctor qualified as Bi
|
||||||
|
import Data.Bit (cloneToByteString)
|
||||||
import Data.Bit qualified as B
|
import Data.Bit qualified as B
|
||||||
import Data.Bit qualified as BV
|
import Data.Bit qualified as BV
|
||||||
import Data.Bits (Bits ((.|.)))
|
import Data.Bits (Bits ((.|.)))
|
||||||
|
|
@ -19,9 +20,15 @@ import Data.Text qualified as T
|
||||||
import Data.Text.IO qualified as TIO
|
import Data.Text.IO qualified as TIO
|
||||||
import Data.Vector.Unboxed qualified as V
|
import Data.Vector.Unboxed qualified as V
|
||||||
import Data.Word
|
import Data.Word
|
||||||
|
import Debug.Trace qualified as D
|
||||||
import GHC.Generics (Generic)
|
import GHC.Generics (Generic)
|
||||||
|
import Options.Generic qualified as O
|
||||||
import System.Environment qualified as SE
|
import System.Environment qualified as SE
|
||||||
|
|
||||||
|
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic)
|
||||||
|
|
||||||
|
instance O.ParseRecord CompressOrDecompress
|
||||||
|
|
||||||
data HuffmanTree a
|
data HuffmanTree a
|
||||||
= Leaf a
|
= Leaf a
|
||||||
| Node
|
| Node
|
||||||
|
|
@ -37,9 +44,9 @@ findTreeDirections (Node {..}) = M.union (rec' L left) (rec' R right)
|
||||||
rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir]
|
rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir]
|
||||||
rec' dir = M.map (dir :) . findTreeDirections
|
rec' dir = M.map (dir :) . findTreeDirections
|
||||||
|
|
||||||
decompress :: forall a. (Ord a, Integral a, B.FiniteBits a) => Maybe ([TreeDir], HuffmanTree a) -> Maybe [a]
|
decompress :: forall a. (Ord a, Integral a, B.FiniteBits a) => Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString
|
||||||
decompress Nothing = Just []
|
decompress Nothing = Just []
|
||||||
decompress (Just (treeDirs, tree)) = decompress' treeDirs
|
decompress (Just (treeDirs, tree)) = BS.concat . map toByteString <$> decompress' treeDirs
|
||||||
where
|
where
|
||||||
decompress' :: [TreeDir] -> Maybe [a]
|
decompress' :: [TreeDir] -> Maybe [a]
|
||||||
decompress' [] = Just []
|
decompress' [] = Just []
|
||||||
|
|
@ -71,13 +78,31 @@ compress bs =
|
||||||
|
|
||||||
dividedByteString = toBitsList bs
|
dividedByteString = toBitsList bs
|
||||||
|
|
||||||
testCompression :: forall a. (Ord a, Eq a, Integral a, B.FiniteBits a) => D.Proxy a -> BS.ByteString -> Bool
|
testCompression :: forall a. (Ord a, Eq a, Integral a, B.FiniteBits a, C.Serialize a) => D.Proxy a -> BS.ByteString -> Bool
|
||||||
testCompression _ bs = ((Just . toBitsList $ bs) :: Maybe [a]) == (decompress . compress $ bs)
|
testCompression _ bs =
|
||||||
|
((Right . Just $ bs) ==)
|
||||||
|
. Bi.second (decompress :: Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString)
|
||||||
|
-- . D.traceShowWith (Bi.second (fmap fst))
|
||||||
|
. (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a)))
|
||||||
|
. encodeCompressed
|
||||||
|
-- . D.traceShowWith (fmap fst)
|
||||||
|
. (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a))
|
||||||
|
$ bs
|
||||||
|
|
||||||
encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString
|
encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> (BS.ByteString)
|
||||||
encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs)
|
encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs)
|
||||||
where
|
where
|
||||||
encodeTreeDirs = BV.cloneToByteString . V.fromList . map (BV.Bit . (== R))
|
encodeTreeDirs = cloneToByteStringWithLen . V.fromList . map (BV.Bit . (== R))
|
||||||
|
|
||||||
|
cloneToByteStringWithLen vec = (BV.cloneToByteString vec, V.length vec)
|
||||||
|
|
||||||
|
cloneFromByteStringWithLen (bs, len) = V.take len . BV.cloneFromByteString $ bs
|
||||||
|
|
||||||
|
decodeTreeDirs :: (BS.ByteString, Int) -> [TreeDir]
|
||||||
|
decodeTreeDirs = map (\x -> if BV.unBit x then R else L) . V.toList . cloneFromByteStringWithLen
|
||||||
|
|
||||||
|
decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBits a, C.Serialize a) => BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a))
|
||||||
|
decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode
|
||||||
|
|
||||||
mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a)
|
mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a)
|
||||||
mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of
|
mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of
|
||||||
|
|
@ -101,23 +126,28 @@ toBitsList bs = case finiteBitUncons bs of
|
||||||
Nothing -> []
|
Nothing -> []
|
||||||
(Just (x, xs)) -> x : (toBitsList xs)
|
(Just (x, xs)) -> x : (toBitsList xs)
|
||||||
|
|
||||||
|
toByteString :: forall a. (Integral a, B.FiniteBits a) => a -> BS.ByteString
|
||||||
|
toByteString n = BS.pack . take numBytes . map (fromIntegral . (n `B.rotateL`)) $ [8, 16 ..]
|
||||||
|
where
|
||||||
|
numBytes = numBytesIn (D.Proxy :: D.Proxy a)
|
||||||
|
|
||||||
finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString)
|
finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString)
|
||||||
finiteBitUncons [] = Nothing
|
finiteBitUncons [] = Nothing
|
||||||
finiteBitUncons bs =
|
finiteBitUncons bs =
|
||||||
Just
|
Just
|
||||||
. (,rest)
|
. (,rest)
|
||||||
. F.foldl' (.|.) 0
|
. F.foldl' (.|.) 0
|
||||||
. zipWith (flip B.shiftL) [0, 8 ..]
|
. zipWith (flip B.rotateR) [8, 16 ..]
|
||||||
. reverse
|
|
||||||
. map (fromIntegral :: Word8 -> a)
|
. map (fromIntegral :: Word8 -> a)
|
||||||
. BS.unpack
|
. BS.unpack
|
||||||
$ takenBytes
|
$ takenBytes
|
||||||
where
|
where
|
||||||
numBytes = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a)
|
|
||||||
|
|
||||||
takenBytes :: BS.ByteString
|
takenBytes :: BS.ByteString
|
||||||
rest :: BS.ByteString
|
rest :: BS.ByteString
|
||||||
(takenBytes, rest) = BS.splitAt numBytes bs
|
(takenBytes, rest) = BS.splitAt (numBytesIn (D.Proxy :: D.Proxy a)) bs
|
||||||
|
|
||||||
|
numBytesIn :: forall a. (B.FiniteBits a) => D.Proxy a -> Int
|
||||||
|
numBytesIn _ = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a)
|
||||||
|
|
||||||
compressionRatioFor
|
compressionRatioFor
|
||||||
:: forall a
|
:: forall a
|
||||||
|
|
@ -133,14 +163,18 @@ compressionRatioFor proxy bs =
|
||||||
. (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a))
|
. (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a))
|
||||||
$ bs
|
$ bs
|
||||||
|
|
||||||
|
applyCompressionOptions :: CompressOrDecompress -> BS.ByteString -> BS.ByteString
|
||||||
|
applyCompressionOptions Compress f = encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word8)) $ f
|
||||||
|
applyCompressionOptions Decompress f = case Bi.second decompress
|
||||||
|
. (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree Word8)))
|
||||||
|
$ f of
|
||||||
|
-- TODO: write errors to stderr
|
||||||
|
(Left _) -> []
|
||||||
|
(Right Nothing) -> []
|
||||||
|
(Right (Just bs)) -> bs
|
||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
main = do
|
main = do
|
||||||
[filePath] <- SE.getArgs
|
compresionOrDecompression :: CompressOrDecompress <- O.getRecord "compression/decompression"
|
||||||
f <- BS.readFile filePath
|
f <- BS.getContents
|
||||||
TIO.putStrLn "does decompress . compress == id ?"
|
BS.putStr . applyCompressionOptions compresionOrDecompression $ f
|
||||||
print . ("Word64",) . testCompression (D.Proxy :: D.Proxy Word16) $ f
|
|
||||||
TIO.putStrLn "compression ratios:"
|
|
||||||
print . ("Word64",) . compressionRatioFor (D.Proxy :: D.Proxy Word64) $ f
|
|
||||||
print . ("Word32",) . compressionRatioFor (D.Proxy :: D.Proxy Word32) $ f
|
|
||||||
print . ("Word16",) . compressionRatioFor (D.Proxy :: D.Proxy Word16) $ f
|
|
||||||
print . ("Word8",) . compressionRatioFor (D.Proxy :: D.Proxy Word8) $ f
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue