127 lines
4.1 KiB
Haskell
127 lines
4.1 KiB
Haskell
module Main where
|
|
|
|
import Data.Bifunctor qualified as Bi
|
|
import Data.Bit qualified as B
|
|
import Data.Bit qualified as BV
|
|
import Data.Bits (Bits ((.|.)))
|
|
import Data.Bits qualified as B
|
|
import Data.ByteString (fromFilePath)
|
|
import Data.ByteString qualified as BS
|
|
import Data.Data qualified as D
|
|
import Data.Foldable qualified as F
|
|
import Data.IntMap.Strict qualified as IM
|
|
import Data.Map.Strict qualified as M
|
|
import Data.Maybe (fromMaybe)
|
|
import Data.Maybe qualified as My
|
|
import Data.PQueue qualified as PQ
|
|
import Data.Serialize qualified as C
|
|
import Data.Text qualified as T
|
|
import Data.Text.IO qualified as TIO
|
|
import Data.Vector.Unboxed qualified as V
|
|
import Data.Word
|
|
import GHC.Generics (Generic)
|
|
import System.Environment qualified as SE
|
|
|
|
data HuffmanTree a
|
|
= Leaf a
|
|
| Node
|
|
{ left :: HuffmanTree a
|
|
, right :: HuffmanTree a
|
|
}
|
|
deriving (Eq, Ord, Show, Generic, C.Serialize)
|
|
|
|
data TreeDir = L | R deriving (Eq, Ord, Show, Generic, C.Serialize)
|
|
|
|
findTreeDirections :: forall a. (Ord a) => HuffmanTree a -> M.Map a [TreeDir]
|
|
findTreeDirections (Leaf a) = M.singleton a []
|
|
findTreeDirections (Node{..}) = M.union (rec' L left) (rec' R right)
|
|
where
|
|
rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir]
|
|
rec' dir = M.map (dir :) . findTreeDirections
|
|
|
|
compress :: forall a. (Ord a, Integral a, B.FiniteBits a) => BS.ByteString -> Maybe ([TreeDir], HuffmanTree a)
|
|
compress bs =
|
|
liftA2 (,) treeDirections mergedHuffmanTrees
|
|
where
|
|
treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString
|
|
|
|
mergedHuffmanTrees =
|
|
mergeHuffmanTrees
|
|
. PQ.fromList
|
|
. map (uncurry (flip (,)) . Bi.first Leaf)
|
|
. counts
|
|
$ dividedByteString
|
|
|
|
treeDirMap :: M.Map a [TreeDir]
|
|
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
|
|
|
|
dividedByteString = toBitsList bs
|
|
|
|
encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString
|
|
encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs)
|
|
where
|
|
encodeTreeDirs = BV.cloneToByteString . V.fromList . map (BV.Bit . (== R))
|
|
|
|
mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a)
|
|
mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of
|
|
Nothing -> Nothing
|
|
Just (size, (x, Nothing)) -> Just x
|
|
Just (size, (x, Just (size1, (x', rest)))) -> mergeHuffmanTrees $ PQ.insert (size + size1) (Node x x') rest
|
|
|
|
counts :: (Ord a) => [a] -> [(a, Int)]
|
|
counts = M.toList . F.foldl' combiningInsert M.empty
|
|
where
|
|
combiningInsert m key = M.insertWith (+) key 1 m
|
|
|
|
divideByteString :: Int -> BS.ByteString -> [BS.ByteString]
|
|
divideByteString n [] = []
|
|
divideByteString n bs = x : divideByteString n xs
|
|
where
|
|
(x, xs) = BS.splitAt n bs
|
|
|
|
toBitsList :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> [a]
|
|
toBitsList bs = case finiteBitUncons bs of
|
|
Nothing -> []
|
|
(Just (x, xs)) -> x : (toBitsList xs)
|
|
|
|
finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString)
|
|
finiteBitUncons [] = Nothing
|
|
finiteBitUncons bs =
|
|
Just
|
|
. (,rest)
|
|
. F.foldl' (.|.) 0
|
|
. zipWith (flip B.shiftL) [0, 8 ..]
|
|
. reverse
|
|
. map (fromIntegral :: Word8 -> a)
|
|
. BS.unpack
|
|
$ takenBytes
|
|
where
|
|
numBytes = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a)
|
|
|
|
takenBytes :: BS.ByteString
|
|
rest :: BS.ByteString
|
|
(takenBytes, rest) = BS.splitAt numBytes bs
|
|
|
|
compressionRatioFor ::
|
|
forall a.
|
|
(Integral a, B.FiniteBits a, Ord a, C.Serialize a) =>
|
|
D.Proxy a ->
|
|
BS.ByteString ->
|
|
Double
|
|
compressionRatioFor proxy bs =
|
|
(/ (fromIntegral . BS.length $ bs))
|
|
. fromIntegral
|
|
. BS.length
|
|
. encodeCompressed
|
|
. (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a))
|
|
$ bs
|
|
|
|
main :: IO ()
|
|
main = do
|
|
[filePath] <- SE.getArgs
|
|
f <- BS.readFile filePath
|
|
TIO.putStrLn "compression ratios:"
|
|
print . ("Word64",) . compressionRatioFor (D.Proxy :: D.Proxy Word64) $ f
|
|
print . ("Word32",) . compressionRatioFor (D.Proxy :: D.Proxy Word32) $ f
|
|
print . ("Word16",) . compressionRatioFor (D.Proxy :: D.Proxy Word16) $ f
|
|
print . ("Word8",) . compressionRatioFor (D.Proxy :: D.Proxy Word8) $ f
|