diff --git a/.gitignore b/.gitignore index 39c1eec..d4c8ed4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ /result /.direnv/ **/.DS_Store +/cabal.project.local diff --git a/compress.cabal b/compress.cabal index 84365e1..fdd926f 100644 --- a/compress.cabal +++ b/compress.cabal @@ -50,4 +50,8 @@ executable compress default-language: GHC2021 other-modules: - Data.SortedList + Data.PQueue + ghc-options: + -fprof-auto + -fprof-late + "-with-rtsopts=-p -hc" diff --git a/src/Data/PQueue.hs b/src/Data/PQueue.hs new file mode 100644 index 0000000..3193376 --- /dev/null +++ b/src/Data/PQueue.hs @@ -0,0 +1,26 @@ +module Data.PQueue where + +import Data.Bifunctor qualified as Bi +import Data.IntMap.Strict qualified as IM +import Data.List qualified as L +import Data.List.NonEmpty qualified as NE + +newtype PQueue a = PQueue + { toMap :: IM.IntMap (NE.NonEmpty a) + } + deriving (Functor, Show) + +fromList :: [(Int, a)] -> PQueue a +fromList = PQueue . IM.fromListWith NE.append . map (Bi.second NE.singleton) + +singleton :: Int -> a -> PQueue a +singleton key a = PQueue $ IM.singleton key [a] + +minView :: PQueue a -> Maybe (Int, (a, PQueue a)) +minView (PQueue m) = case IM.lookupMin m of + Nothing -> Nothing + (Just (key, x NE.:| (x' : xs))) -> Just (key, (x, PQueue $ IM.insert key (x' NE.:| xs) m)) + (Just (key, x NE.:| [])) -> Just (key, (x, PQueue $ IM.delete key m)) + +insert :: Int -> p -> PQueue p -> PQueue p +insert key a = PQueue . IM.insertWith NE.append key (NE.singleton a) . toMap diff --git a/src/Data/SortedList.hs b/src/Data/SortedList.hs deleted file mode 100644 index 0d2d601..0000000 --- a/src/Data/SortedList.hs +++ /dev/null @@ -1,27 +0,0 @@ -module Data.SortedList where -import qualified Data.List as L - -data SortedList a = Ord a => SortedList { - toList :: [a] -} - -deriving instance Show a => Show (SortedList a) - -map :: Ord a1 => ([a2] -> [a1]) -> SortedList a2 -> SortedList a1 -map f (SortedList a) = SortedList . f $ a - -fromList :: Ord a => [a] -> SortedList a -fromList = SortedList . L.sort - -singleton :: Ord a => a -> SortedList a -singleton a = SortedList [a] - -minView :: SortedList a -> Maybe (a, SortedList a) -minView (SortedList (x:xs)) = Just (x, SortedList xs) -minView (SortedList _) = Nothing - -insert :: p -> SortedList p -> SortedList p -insert x (SortedList []) = singleton x -insert x (SortedList (x' : xs)) - | x <= x' = SortedList (x : x' : xs) - | otherwise = Data.SortedList.map (x' :) . insert x . SortedList $ xs diff --git a/src/Main.hs b/src/Main.hs index 3ac3f27..291a35a 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -7,13 +7,14 @@ import Data.Bits (Bits ((.|.))) import Data.Bits qualified as B import Data.ByteString (fromFilePath) import Data.ByteString qualified as BS +import Data.Data qualified as D import Data.Foldable qualified as F import Data.IntMap.Strict qualified as IM import Data.Map.Strict qualified as M import Data.Maybe (fromMaybe) import Data.Maybe qualified as My +import Data.PQueue qualified as PQ import Data.Serialize qualified as C -import Data.SortedList qualified as SL import Data.Text qualified as T import Data.Text.IO qualified as TIO import Data.Vector.Unboxed qualified as V @@ -29,12 +30,6 @@ data HuffmanTree a } deriving (Eq, Ord, Show, Generic, C.Serialize) -data SizedHuffmanTree a = SizedHuffmanTree - { size :: Int - , tree :: HuffmanTree a - } - deriving (Eq, Ord, Show) - data TreeDir = L | R deriving (Eq, Ord, Show, Generic, C.Serialize) findTreeDirections :: forall a. (Ord a) => HuffmanTree a -> M.Map a [TreeDir] @@ -44,48 +39,34 @@ findTreeDirections (Node{..}) = M.union (rec' L left) (rec' R right) rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir] rec' dir = M.map (dir :) . findTreeDirections -mergeSizedTrees tree0 tree1 = - SizedHuffmanTree - { size = (size tree0) + (size tree1) - , tree = Node (tree tree0) (tree tree1) - } - --- compress :: BS.ByteString -> Maybe (SizedHuffmanTree BS.ByteString) compress :: forall a. (Ord a, Integral a, B.FiniteBits a) => BS.ByteString -> Maybe ([TreeDir], HuffmanTree a) compress bs = - liftA2 (,) treeDirections unsizedTree + liftA2 (,) treeDirections mergedHuffmanTrees where - unsizedTree :: Maybe (HuffmanTree a) - unsizedTree = tree <$> mergedHuffmanTrees - treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString mergedHuffmanTrees = mergeHuffmanTrees - . SL.fromList - . map (uncurry (flip SizedHuffmanTree) . Bi.first Leaf) + . PQ.fromList + . map (uncurry (flip (,)) . Bi.first Leaf) . counts $ dividedByteString treeDirMap :: M.Map a [TreeDir] - treeDirMap = My.maybe M.empty (findTreeDirections . tree) mergedHuffmanTrees + treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees dividedByteString = toBitsList bs -encodeCompressed :: C.Serialize a => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString +encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs) where encodeTreeDirs = BV.cloneToByteString . V.fromList . map (BV.Bit . (== R)) -mergeHuffmanTrees :: SL.SortedList (SizedHuffmanTree a) -> Maybe (SizedHuffmanTree a) -mergeHuffmanTrees (SL.SortedList (tree0 : tree1 : xs)) = - mergeHuffmanTrees - . SL.insert (mergeSizedTrees tree0 tree1) - . SL.SortedList - $ xs - -mergeHuffmanTrees (SL.SortedList [x]) = Just x -mergeHuffmanTrees (SL.SortedList []) = Nothing +mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a) +mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of + Nothing -> Nothing + Just (size, (x, Nothing)) -> Just x + Just (size, (x, Just (size1, (x', rest)))) -> mergeHuffmanTrees $ PQ.insert (size + size1) (Node x x') rest counts :: (Ord a) => [a] -> [(a, Int)] counts = M.toList . F.foldl' combiningInsert M.empty @@ -121,14 +102,26 @@ finiteBitUncons bs = rest :: BS.ByteString (takenBytes, rest) = BS.splitAt numBytes bs +compressionRatioFor :: + forall a. + (Integral a, B.FiniteBits a, Ord a, C.Serialize a) => + D.Proxy a -> + BS.ByteString -> + Double +compressionRatioFor proxy bs = + (/ (fromIntegral . BS.length $ bs)) + . fromIntegral + . BS.length + . encodeCompressed + . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a)) + $ bs + main :: IO () main = do [filePath] <- SE.getArgs f <- BS.readFile filePath - TIO.putStrLn "original:" - print . BS.length $ f - TIO.putStrLn "rest:" - print . ("Word64",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word64)) $ f - print . ("Word32",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word32)) $ f - print . ("Word16",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word16)) $ f - print . ("Word8",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word8)) $ f + TIO.putStrLn "compression ratios:" + print . ("Word64",) . compressionRatioFor (D.Proxy :: D.Proxy Word64) $ f + print . ("Word32",) . compressionRatioFor (D.Proxy :: D.Proxy Word32) $ f + print . ("Word16",) . compressionRatioFor (D.Proxy :: D.Proxy Word16) $ f + print . ("Word8",) . compressionRatioFor (D.Proxy :: D.Proxy Word8) $ f