From 791fff6107c35110c815244e5780f92321e97d85 Mon Sep 17 00:00:00 2001 From: Jack Wines Date: Sat, 21 Dec 2024 12:28:00 -0500 Subject: [PATCH] add arithmetic coding --- .gitignore | 1 + compress.cabal | 19 +++- flake.lock | 42 ++++---- flake.nix | 11 +- src/Compress/Arithmetic.hs | 149 ++++++++++++++++++++++++++ src/Compress/Huffman.hs | 125 ++++++++++++++++++++++ src/Compress/PrefixTree.hs | 188 +++++++++++++++++++++++++++++++++ src/Data/FiniteBit.hs | 45 ++++++++ src/Data/HuffmanTree.hs | 67 ++++++++++++ src/Data/PQueue.hs | 5 +- src/Main.hs | 209 +++++++++---------------------------- 11 files changed, 667 insertions(+), 194 deletions(-) create mode 100644 src/Compress/Arithmetic.hs create mode 100644 src/Compress/Huffman.hs create mode 100644 src/Compress/PrefixTree.hs create mode 100644 src/Data/FiniteBit.hs create mode 100644 src/Data/HuffmanTree.hs diff --git a/.gitignore b/.gitignore index d724111..a2245f9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /dist-newstyle/ +/haddocks/ /result /.direnv/ **/.DS_Store diff --git a/compress.cabal b/compress.cabal index 8427e5b..8939f3c 100644 --- a/compress.cabal +++ b/compress.cabal @@ -43,6 +43,7 @@ executable compress src build-depends: base, + basement, bitvec, bytestring, cereal, @@ -50,12 +51,20 @@ executable compress text, uuid, optparse-generic, - vector + vector, + nonempty-containers, + primes default-language: GHC2021 other-modules: Data.PQueue - -- ghc-options: - -- -fprof-auto - -- -fprof-late - -- "-with-rtsopts=-p -hc" + Data.FiniteBit + Compress.Huffman + Compress.PrefixTree + Data.HuffmanTree + Compress.Arithmetic + ghc-options: + -threaded + -fprof-auto + -fprof-late + "-with-rtsopts=-p -hc" diff --git a/flake.lock b/flake.lock index 7e1dcac..ebcd77f 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1712014858, - "narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=", + "lastModified": 1733312601, + "narHash": "sha256-4pDvzqnegAfRkPwO3wmwBhVi/Sye1mzps0zHWYnP88c=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "9126214d0a59633752a136528f5f3b9aa8565b7d", + "rev": "205b12d8b7cd4802fbcb8e8ef6a0f1408781a4f9", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "haskell-flake": { "locked": { - "lastModified": 1713084600, - "narHash": "sha256-qL7LV2MtwJ+1Xasg1TjSUmoE7yrRuXPqxpPlKjLE0SE=", + "lastModified": 1734464164, + "narHash": "sha256-5JCCyrgy7IMnipyYMQzIAXncGt2XVlW1aK71A+FTXDs=", "owner": "srid", "repo": "haskell-flake", - "rev": "847292fc793a5c15c873e52e7751ee4267ef32a0", + "rev": "e280b39efdd72b6a5bdaa982b67f150c819be642", "type": "github" }, "original": { @@ -35,11 +35,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1712963716, - "narHash": "sha256-WKm9CvgCldeIVvRz87iOMi8CFVB1apJlkUT4GGvA0iM=", + "lastModified": 1734424634, + "narHash": "sha256-cHar1vqHOOyC7f1+tVycPoWTfKIaqkoe1Q6TnKzuti4=", "owner": "nixos", "repo": "nixpkgs", - "rev": "cfd6b5fc90b15709b780a5a1619695a88505a176", + "rev": "d3c42f187194c26d9f0309a8ecc469d6c878ce33", "type": "github" }, "original": { @@ -51,20 +51,14 @@ }, "nixpkgs-lib": { "locked": { - "dir": "lib", - "lastModified": 1711703276, - "narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=", - "owner": "NixOS", - "repo": "nixpkgs", - "rev": "d8fe5e6c92d0d190646fb9f1056741a229980089", - "type": "github" + "lastModified": 1733096140, + "narHash": "sha256-1qRH7uAUsyQI7R1Uwl4T+XvdNv778H0Nb5njNrqvylY=", + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz" }, "original": { - "dir": "lib", - "owner": "NixOS", - "ref": "nixos-unstable", - "repo": "nixpkgs", - "type": "github" + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz" } }, "root": { @@ -98,11 +92,11 @@ ] }, "locked": { - "lastModified": 1711963903, - "narHash": "sha256-N3QDhoaX+paWXHbEXZapqd1r95mdshxToGowtjtYkGI=", + "lastModified": 1733761991, + "narHash": "sha256-s4DalCDepD22jtKL5Nw6f4LP5UwoMcPzPZgHWjAfqbQ=", "owner": "numtide", "repo": "treefmt-nix", - "rev": "49dc4a92b02b8e68798abd99184f228243b6e3ac", + "rev": "0ce9d149d99bc383d1f2d85f31f6ebd146e46085", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index d5832a7..859754e 100644 --- a/flake.nix +++ b/flake.nix @@ -22,7 +22,7 @@ # See https://github.com/srid/haskell-flake/blob/master/example/flake.nix haskellProjects.default = { # The base package set (this value is the default) - # basePackages = pkgs.haskellPackages; + basePackages = pkgs.haskell.packages.ghc96; # Packages to add on top of `basePackages` packages = { @@ -34,6 +34,13 @@ # Add your package overrides here settings = { + uuid ={ + jailbreak = true; + }; + + # hlint = { + # jailbreak = true; + # }; # barbies-th = { # broken = false; # jailbreak = true; @@ -57,7 +64,7 @@ programs.ormolu.enable = true; programs.nixpkgs-fmt.enable = true; programs.cabal-fmt.enable = true; - programs.hlint.enable = true; + programs.hlint.enable = false; # We use fourmolu programs.ormolu.package = pkgs.haskellPackages.fourmolu; diff --git a/src/Compress/Arithmetic.hs b/src/Compress/Arithmetic.hs new file mode 100644 index 0000000..bf0f6ec --- /dev/null +++ b/src/Compress/Arithmetic.hs @@ -0,0 +1,149 @@ +-- | https://en.wikipedia.org/wiki/Arithmetic_coding +module Compress.Arithmetic where + +import Basement.Bits qualified as B +import Basement.Compat.Base (Word16, Word32, Word64, Word8) +import Basement.Compat.Bifunctor qualified as Bi +import Control.Arrow qualified as Ar +import Data.ByteString qualified as By +import Data.FiniteBit qualified as Fi +import Data.Foldable as F +import Data.List (genericLength) +import Data.List qualified as L +import Data.Map.Strict qualified as M +import Data.Maybe qualified as My +import Data.Numbers.Primes +import Data.Ord +import Data.Ratio +import Data.Serialize qualified as C +import GHC.Generics (Generic) +import GHC.Natural (Natural) + +-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes + +-- shortestNumberBetween :: Ratio Integer -> Ratio Integer -> Maybe (Ratio Integer) +-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums + +data WordMarkovStats = WordMarkovStats + { location :: Ratio Integer, + size :: Ratio Integer + } + deriving (Show, Eq, Ord, Generic, C.Serialize) + +toRing :: forall a k. (Ord k, Integral a, Bounded a) => M.Map k a -> M.Map k WordMarkovStats +toRing m = M.fromList . zip (map fst asList) $ zipWith WordMarkovStats (scanl (+) 0 . map snd $ asList) (map snd asList) + where + asList = M.toList $ m' + sum' = + sum + . map fromIntegral + . M.elems + $ m + m' = M.map ((% sum') . fromIntegral) m + + maxBound' :: Integer + maxBound' = fromIntegral (maxBound :: a) + +twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k (M.Map k b) +twoByteMarkov xs = + M.map sizeAsFraction + . M.fromListWith (M.unionWith (+)) + . zip xs + . map (`M.singleton` (1 :: Integer)) + . tail + $ xs + where + sizeAsFraction m = M.map (max 1 . floor . fromRational . (* maxBound') . (% sum')) m + where + sum' = sum . M.elems $ m + + toInteger :: (Integral a) => a -> Integer + toInteger = fromIntegral + + maxBound' = fromIntegral (maxBound :: b) + +data Compressed a = Compressed + { markovs :: M.Map a (M.Map a (Word8)), + location :: Ratio Integer, + start :: a, + length :: Int + } + deriving (Eq, Ord, Show, Generic, C.Serialize) + +decompress :: + forall a. + (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) => + Compressed a -> + [a] +decompress (Compressed {..}) = take length $ map snd . L.iterate decompress' $ (location, start) + where + decompress' :: (Ratio Integer, a) -> (Ratio Integer, a) + decompress' (loc, prev) = ((loc - ansLoc) / newSize, newVal) + where + (ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev) + + rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs + + toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size)) + +sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8] +sanityCheck = map (sum . M.elems) . M.elems + +compress :: + forall a. + (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) => + [a] -> + Compressed a +compress toCompress = Compressed twoByteMarkovs (shortestLocation endStats) (head toCompress) (genericLength toCompress) + where + twoByteMarkovs = twoByteMarkov toCompress + + rings = M.map toRing twoByteMarkovs + + pairs = zip toCompress . tail $ toCompress + + shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size) + + endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs + + addWordMarkovStats + (WordMarkovStats {location = prevLoc, size = prevSize}) + (WordMarkovStats {location = nextLoc, size = nextSize}) = + WordMarkovStats + { location = prevLoc + (prevSize * nextLoc), + size = prevSize * nextSize + } + + statsFor (x0, x1) = (rings M.! x0) M.! x1 + +pyramidFold f = pyramid + where + pyramid [x] = x + pyramid xs = pyramid $ pyramidFold' xs + + pyramidFold' [x] = [x] + pyramidFold' (x0 : x1 : []) = [f x0 x1] + pyramidFold' (x0 : x1 : xs) = (f x0 x1) : (pyramidFold' xs) + +-- borrowed and slightly changed from Data.Ratio source +simplestBetween :: Rational -> Rational -> Rational +simplestBetween x y + | x == y = x + | x > 0 = simplestBetween' n d n' d' + | otherwise = 0 % 1 + where + n = numerator x + d = denominator x + n' = numerator y + d' = denominator y + + simplestBetween' n d n' d' -- assumes 0 < n%d < n'%d' + | r == 0 = q % 1 + | q /= q' = (q + 1) % 1 + | otherwise = (q * n'' + d'') % n'' + where + (q, r) = quotRem n d + (q', r') = quotRem n' d' + nd'' = simplestBetween' d' r' d r + n'' = numerator nd'' + d'' = denominator nd'' diff --git a/src/Compress/Huffman.hs b/src/Compress/Huffman.hs new file mode 100644 index 0000000..582cd35 --- /dev/null +++ b/src/Compress/Huffman.hs @@ -0,0 +1,125 @@ +module Compress.Huffman where + + +import Data.Bifunctor qualified as Bi +import Data.FiniteBit +import Data.Bit (cloneToByteString) +import Data.Bit qualified as B +import Data.Bit qualified as BV +import Data.ByteString (fromFilePath) +import Data.ByteString qualified as BS +import Data.Data qualified as D +import Data.Foldable qualified as F +import Data.IntMap.Strict qualified as IM +import qualified Data.Proxy as D +import qualified Basement.From as F +import Data.Map.Strict qualified as M +import Basement.Bits as B +import Data.Maybe (fromMaybe) +import Data.Maybe qualified as My +import Data.PQueue qualified as PQ +import Data.Serialize qualified as C +import Data.Text qualified as T +import Data.Text.IO qualified as TIO +import Data.Vector.Unboxed qualified as V +import Data.Word +import Debug.Trace qualified as D +import GHC.Generics (Generic) +import Options.Generic qualified as O +import System.Environment qualified as SE +import Data.HuffmanTree + +decompress + :: forall a + . (Ord a, Integral a, B.FiniteBitsOps a) + => (TreeDirs, HuffmanTree a) + -> Maybe BS.ByteString +decompress (TreeDirs treeDirs, tree) = BS.concat . map toByteString <$> decompress' treeDirs + where + decompress' :: [TreeDir] -> Maybe [a] + decompress' [] = Just [] + decompress' xs = case nextLeaf xs tree of + Nothing -> Nothing + Just (x, remainingDirs) -> (x :) <$> decompress' remainingDirs + + nextLeaf :: [TreeDir] -> HuffmanTree a -> Maybe (a, [TreeDir]) + nextLeaf xs (Leaf a) = Just (a, xs) + nextLeaf [] _ = Nothing + nextLeaf (L : xs) (Node {..}) = nextLeaf xs left + nextLeaf (R : xs) (Node {..}) = nextLeaf xs right + +compress + :: forall a + . (Ord a, Integral a, B.FiniteBitsOps a, B.BitOps a) + => BS.ByteString + -> Maybe (TreeDirs, HuffmanTree a) +compress bs = + liftA2 (,) (TreeDirs <$> treeDirections) mergedHuffmanTrees + where + treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString + + mergedHuffmanTrees = + mergeHuffmanTrees + . PQ.fromList + . map (uncurry (flip (,)) . Bi.first Leaf) + . counts + $ dividedByteString + + treeDirMap :: M.Map a [TreeDir] + treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees + + dividedByteString = toWordsList bs + +-- testCompression +-- :: forall a +-- . (Ord a, Eq a, Integral a, B.FiniteBitsOps a, B.BitOps a, C.Serialize a) +-- => D.Proxy a +-- -> BS.ByteString +-- -> Bool +-- testCompression _ bs = +-- ((Right . Just $ bs) ==) +-- . Bi.second (decompress :: Maybe (TreeDirs, HuffmanTree a) -> Maybe BS.ByteString) +-- -- . D.traceShowWith (Bi.second (fmap fst)) +-- . (decodeCompressed :: BS.ByteString -> Either String (Maybe (TreeDirs, HuffmanTree a))) +-- . encodeCompressed +-- -- . D.traceShowWith (fmap fst) +-- . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) +-- $ bs + +-- cloneToByteStringWithLen :: V.Vector BV.Bit -> (BS.ByteString, Int) +-- cloneToByteStringWithLen vec = (BV.cloneToByteString vec, V.length vec) + +-- cloneFromByteStringWithLen :: (BS.ByteString, Int) -> V.Vector BV.Bit +-- cloneFromByteStringWithLen (bs, len) = V.take len . BV.cloneFromByteString $ bs + +-- decodeTreeDirs :: (BS.ByteString, Int) -> [TreeDir] +-- decodeTreeDirs = map (\x -> if BV.unBit x then R else L) . V.toList . cloneFromByteStringWithLen + +-- decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBitsOps a, C.Serialize a) => BS.ByteString -> Either String (Maybe (TreeDirs, HuffmanTree a)) +-- decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode + +counts :: (Ord a) => [a] -> [(a, Int)] +counts = M.toList . F.foldl' combiningInsert M.empty + where + combiningInsert m key = M.insertWith (+) key 1 m + +divideByteString :: Int -> BS.ByteString -> [BS.ByteString] +divideByteString n [] = [] +divideByteString n bs = x : divideByteString n xs + where + (x, xs) = BS.splitAt n bs + + +compressionRatioFor + :: forall a + . (Integral a, B.FiniteBitsOps a, B.BitOps a, Ord a, C.Serialize a) + => D.Proxy a + -> BS.ByteString + -> Double +compressionRatioFor proxy bs = + (/ (fromIntegral . BS.length $ bs)) + . fromIntegral + . BS.length + . C.encode + . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) + $ bs diff --git a/src/Compress/PrefixTree.hs b/src/Compress/PrefixTree.hs new file mode 100644 index 0000000..8143615 --- /dev/null +++ b/src/Compress/PrefixTree.hs @@ -0,0 +1,188 @@ +module Compress.PrefixTree where + +import Basement.Bits qualified as B +import qualified Basement.From as F +import Compress.Huffman qualified as H +import Control.Applicative qualified as A +import Data.Bifunctor qualified as Bi +import Data.ByteString qualified as BS +import Data.Foldable qualified as F +import Data.HuffmanTree as HT +import Data.List qualified as L +import Data.List.NonEmpty qualified as NE +import Data.Map.Strict qualified as M +import Data.Maybe qualified as My +import Data.Ord qualified as O +import Data.PQueue qualified as PQ +import Debug.Trace qualified as D +import Debug.Trace qualified as T +import Basement.Bits (FiniteBitsOps(numberOfBits)) +import GHC.Generics +import qualified Data.Serialize as C +import Data.FiniteBit + +data Tree a = (Ord a) => + Tree + { children :: M.Map a (Tree a) + } + +newtype HuffmanPrefixTree a b = HuffmanPrefixTree + { inner :: M.Map a (HuffmanTree b) + } deriving (Eq, Ord, Show, Generic, C.Serialize) + +finiteBitTupleUncons :: + forall a b. + (Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) => + BS.ByteString -> + Maybe ((a, b), BS.ByteString) +finiteBitTupleUncons bs = case finiteBitUncons bs of + Just (a, bs') -> case finiteBitUncons bs' of + Just (b, _) -> Just ((a, b), bs') + _ -> Nothing + _ -> Nothing + +fromByteString :: + forall a b. + (Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) => + BS.ByteString -> + [(a, b)] +fromByteString bs = case finiteBitTupleUncons bs of + Just ((a, b), bs') -> (a, b) : fromByteString bs' + Nothing -> [] + +toHuffmanTree :: + forall a b. + (Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) => + M.Map (a, b) Word -> + HuffmanPrefixTree a b +toHuffmanTree = + HuffmanPrefixTree + . M.mapMaybe HT.fromList + . M.fromListWith (++) + . map (\((a, b), count) -> (a, [(fromIntegral count, b)])) + . M.assocs + +decompress :: + forall a . + (Integral a, B.FiniteBitsOps a, B.BitOps a) => + (TreeDirs, HuffmanPrefixTree a a, a) + -> Maybe BS.ByteString +decompress (TreeDirs treeDirs'', HuffmanPrefixTree prefixTree, initial') = BS.concat . map toByteString . (initial' :) <$> decompress' treeDirs'' initial' + where + decompress' :: [TreeDir] -> a -> Maybe [a] + decompress' treeDirs initial = case HT.lookup (prefixTree M.! initial) treeDirs of + Nothing -> Nothing + Just (ans, []) -> Just [ans] + Just (ans, treeDirs') -> (ans :) <$> decompress' treeDirs' ans + +compress :: + forall a b. + (Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) => + BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a b, a) +compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> initial + where + tree :: HuffmanPrefixTree a b + tree = toHuffmanTree . nGramCounts $ bs + + treeDirMap :: M.Map a (M.Map b [TreeDir]) + treeDirMap = M.map HT.findTreeDirections . Compress.PrefixTree.inner $ tree + + initial :: Maybe a + initial = fst <$> finiteBitUncons bs + + asFiniteBitPairs :: [(a,b)] + asFiniteBitPairs = fromByteString bs + + treeDirsFor :: (a, b) -> [TreeDir] + treeDirsFor (a, b) = (treeDirMap M.! a) M.! b + + + +-- | all (M.null . children) . M.elems . children $ tree = +-- fmap End +-- . HT.fromList +-- . map (\x -> (prefixCounts x, x)) +-- . M.keys +-- . children +-- $ tree +-- | otherwise = +-- Just +-- . Layer +-- . M.mapMaybeWithKey (\key val -> toHuffmanTree' (key : soFar) val) +-- . children +-- $ tree +-- where +-- prefixCounts :: a -> Int +-- prefixCounts x = +-- fromIntegral +-- . sum +-- . M.elems +-- . M.filterWithKey (\key val -> L.isPrefixOf (reverse . (x :) $ soFar) key) +-- $ nGrams + +-- toHuffmanTree :: Tree a -> p1 -> HuffmanTree a +-- toHuffmanTree :: forall a . Tree a -> M.Map [a] Word -> HuffmanTree [a] +-- toHuffmanTree (Tree {..}) nGrams soFar | M.size children == 1 = Leaf . map (reverse . (: soFar)) . M.keys $ children +-- toHuffmanTree (Tree {..}) nGrams soFar = Leaf . map (reverse . (: soFar)) . M.keys $ children +-- where +-- sorted = L.sortBy (prefixCounts . fst) . M.toList $ children + +nGramCounts :: + forall a b. + (Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) => + BS.ByteString -> + M.Map (a, b) Word +nGramCounts = + M.fromListWith (+) + . map (,1) + . My.mapMaybe (My.listToMaybe . fromByteString) + . takeWhile ((== len) . BS.length) + . map (BS.take len) + . BS.tails + where + len = (`div` 8) . F.from $ numberOfBits (0 :: a) + numberOfBits (0 :: b) + +empty :: (Ord a) => Tree a +empty = Tree M.empty + +singleton :: (Ord a) => a -> Tree a +singleton x = Tree $ M.singleton x empty + +fromSingleList :: (Ord a) => [a] -> Tree a +fromSingleList [] = empty +fromSingleList (x : xs) = Tree . M.singleton x . fromSingleList $ xs + +fromList :: (Ord a) => [[a]] -> Tree a +fromList = F.foldl' merge empty . map fromSingleList + +-- insert :: Ord a => Tree a -> [a] -> Tree a +-- insert (Tree {..}) (x:xs) = + +merge :: Tree a -> Tree a -> Tree a +merge (Tree children0) (Tree children1) = Tree $ M.unionWith merge children0 children1 + +-- deriving instance Eq (Tree a) + +-- deriving instance Ord (Tree a) + +-- deriving instance (Show a) => Show (Tree a) + +-- empty :: (Ord a) => Tree a +-- empty = Tree M.empty + +-- fromList :: (Ord a, F.Foldable t) => t [a] -> Tree a +-- fromList = F.foldl' insert empty + +-- insert :: Tree a -> [a] -> Tree a +-- insert (Tree {..}) [] = Tree M.empty +-- insert (Tree {..}) (x : xs) = +-- Tree +-- . flip (M.insert x) children +-- . flip insert xs +-- . My.fromMaybe empty +-- . M.lookup x +-- $ children + +-- lookup :: (Ord a) => [a] -> Tree a -> Bool +-- lookup [] = const True +-- lookup (x : xs) = maybe False (Compress.PrefixTree.lookup xs) . M.lookup x . children diff --git a/src/Data/FiniteBit.hs b/src/Data/FiniteBit.hs new file mode 100644 index 0000000..253cb2b --- /dev/null +++ b/src/Data/FiniteBit.hs @@ -0,0 +1,45 @@ +module Data.FiniteBit where + +import Data.Bit (cloneToByteString) +import Data.Bit qualified as B +import Data.Bit qualified as BV +import Data.ByteString qualified as BS +import qualified Data.Proxy as D +import qualified Basement.From as F +import Data.Foldable qualified as F +import Data.Word +import Basement.Bits as B + + +numBytesIn :: forall a. (B.FiniteBitsOps a, Integral a) => D.Proxy a -> Int +numBytesIn _ = (`div` 8) . F.from . B.numberOfBits $ (0 :: a) + + +toWordsList :: forall a. (Integral a, B.FiniteBitsOps a, B.BitOps a) => BS.ByteString -> [a] +toWordsList bs = case finiteBitUncons bs of + Nothing -> [] + (Just (x, xs)) -> x : toWordsList xs + +toByteString :: forall a. (Integral a, B.FiniteBitsOps a) => a -> BS.ByteString +toByteString n = BS.pack . take numBytes . map (fromIntegral . (n `B.rotateL`)) $ [8, 16 ..] + where + numBytes = numBytesIn (D.Proxy :: D.Proxy a) + +finiteBitUncons + :: forall a + . (Integral a, B.FiniteBitsOps a, B.BitOps a) + => BS.ByteString + -> Maybe (a, BS.ByteString) +finiteBitUncons [] = Nothing +finiteBitUncons bs = + Just + . (,rest) + . F.foldl' (.|.) 0 + . zipWith (flip B.rotateR) [8, 16 ..] + . map (fromIntegral :: Word8 -> a) + . BS.unpack + $ takenBytes + where + takenBytes :: BS.ByteString + rest :: BS.ByteString + (takenBytes, rest) = BS.splitAt (numBytesIn (D.Proxy :: D.Proxy a)) bs diff --git a/src/Data/HuffmanTree.hs b/src/Data/HuffmanTree.hs new file mode 100644 index 0000000..abef11f --- /dev/null +++ b/src/Data/HuffmanTree.hs @@ -0,0 +1,67 @@ +module Data.HuffmanTree where + +import Data.Bifunctor qualified as Bi +import Data.Map.Strict qualified as M +import Data.PQueue qualified as PQ +import Data.Serialize qualified as C +import GHC.Generics (Generic) +import Data.Vector.Unboxed qualified as V +import Data.Bit qualified as B +import Data.Bit qualified as BV +import Data.ByteString qualified as BS + +data HuffmanTree a + = Leaf a + | Node + { left :: HuffmanTree a, + right :: HuffmanTree a + } + deriving (Eq, Ord, Show, Generic, C.Serialize, Functor) + + +-- here so we can define our own Serialize instance +newtype TreeDirs = TreeDirs { + inner :: [TreeDir] + } deriving (Eq, Ord, Show) + +data TreeDir = L | R deriving (Eq, Ord, Show) + +instance C.Serialize TreeDirs where + put :: C.Putter TreeDirs + put = C.put . Bi.first BV.cloneToByteString . (\x -> (x, V.length x)) . V.fromList . map (BV.Bit . (== R)) . (inner :: TreeDirs -> [TreeDir]) + + get :: C.Get TreeDirs + get = do + (bs, len) <- C.get + pure . TreeDirs . map (\x -> if BV.unBit x then R else L) . V.toList . V.take len . BV.cloneFromByteString $ bs + +lookup :: + forall a. + (Ord a) => + HuffmanTree a -> + [TreeDir] -> + Maybe (a, [TreeDir]) +lookup (Node {..}) (L : xs) = Data.HuffmanTree.lookup left xs +lookup (Node {..}) (R : xs) = Data.HuffmanTree.lookup right xs +lookup (Leaf a) xs = Just (a, xs) +lookup _ [] = Nothing + +findTreeDirections :: + forall a. + (Ord a) => + HuffmanTree a -> + M.Map a [TreeDir] +findTreeDirections (Leaf a) = M.singleton a [] +findTreeDirections (Node {..}) = M.union (rec' L left) (rec' R right) + where + rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir] + rec' dir = M.map (dir :) . findTreeDirections + +mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a) +mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of + Nothing -> Nothing + Just (size, (x, Nothing)) -> Just x + Just (size, (x, Just (size1, (x', rest)))) -> mergeHuffmanTrees $ PQ.insert (size + size1) (Node x x') rest + +fromList :: [(Int, a)] -> Maybe (HuffmanTree a) +fromList = mergeHuffmanTrees . PQ.fromList . map (Bi.second Leaf) diff --git a/src/Data/PQueue.hs b/src/Data/PQueue.hs index ee82586..acb10f3 100644 --- a/src/Data/PQueue.hs +++ b/src/Data/PQueue.hs @@ -16,7 +16,10 @@ fromList = PQueue . IM.fromListWith NE.append . map (Bi.second NE.singleton) singleton :: Int -> a -> PQueue a singleton key a = PQueue $ IM.singleton key [a] -abstractView :: (IM.IntMap (NE.NonEmpty a) -> Maybe (IM.Key, NE.NonEmpty a)) -> PQueue a -> Maybe (IM.Key, (a, PQueue a)) +abstractView + :: (IM.IntMap (NE.NonEmpty a) -> Maybe (IM.Key, NE.NonEmpty a)) + -> PQueue a + -> Maybe (IM.Key, (a, PQueue a)) abstractView f (PQueue m) = case f m of Nothing -> Nothing (Just (key, x NE.:| (x' : xs))) -> Just (key, (x, PQueue $ IM.insert key (x' NE.:| xs) m)) diff --git a/src/Main.hs b/src/Main.hs index 7944151..a019f57 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -1,182 +1,67 @@ module Main where +import Basement.Bits qualified as B +import Compress.Huffman +import Compress.Huffman qualified as HT +import Compress.PrefixTree (HuffmanPrefixTree (HuffmanPrefixTree)) +import Compress.PrefixTree qualified as PT +import Compress.Arithmetic qualified as A import Data.Bifunctor qualified as Bi -import Data.Bit (cloneToByteString) import Data.Bit qualified as B -import Data.Bit qualified as BV import Data.Bits (Bits ((.|.))) import Data.Bits qualified as B -import Data.ByteString (fromFilePath) import Data.ByteString qualified as BS -import Data.Data qualified as D -import Data.Foldable qualified as F -import Data.IntMap.Strict qualified as IM -import Data.Map.Strict qualified as M -import Data.Maybe (fromMaybe) -import Data.Maybe qualified as My -import Data.PQueue qualified as PQ +import Data.HuffmanTree +import Data.Proxy qualified as P import Data.Serialize qualified as C -import Data.Text qualified as T -import Data.Text.IO qualified as TIO -import Data.Vector.Unboxed qualified as V import Data.Word -import Debug.Trace qualified as D import GHC.Generics (Generic) import Options.Generic qualified as O -import System.Environment qualified as SE +import qualified Data.FiniteBit as FB -data CompressOrDecompress = Compress | Decompress deriving (Show, Generic) +data CompressOrDecompress = Compress | Decompress deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read) -instance O.ParseRecord CompressOrDecompress +data CompressionStrategy = Huffman | MarkovHuffman deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read) -data HuffmanTree a - = Leaf a - | Node - {left :: HuffmanTree a, right :: HuffmanTree a} - deriving (Eq, Ord, Show, Generic, C.Serialize) +data CLIOpts = CLIOpts + { task :: CompressOrDecompress, + strategy :: CompressionStrategy + } + deriving (Show, Generic, O.ParseRecord) -data TreeDir = L | R deriving (Eq, Ord, Show, Generic, C.Serialize) +applyCompressionOptions :: + forall a. + (Integral a, B.BitOps a, B.FiniteBitsOps a, Ord a, C.Serialize a) => + P.Proxy a -> + CLIOpts -> + BS.ByteString -> + BS.ByteString +applyCompressionOptions _ (CLIOpts Compress Huffman) f = + C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) $ f +applyCompressionOptions _ (CLIOpts Compress MarkovHuffman) f = + C.encode . (PT.compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a a, a)) $ f +applyCompressionOptions _ (CLIOpts Decompress Huffman) f = + handleError $ Bi.second decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanTree a)) $ f +applyCompressionOptions _ (CLIOpts Decompress MarkovHuffman) f = + handleError $ Bi.second PT.decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanPrefixTree a a, a)) $ f -findTreeDirections :: forall a. (Ord a) => HuffmanTree a -> M.Map a [TreeDir] -findTreeDirections (Leaf a) = M.singleton a [] -findTreeDirections (Node {..}) = M.union (rec' L left) (rec' R right) - where - rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir] - rec' dir = M.map (dir :) . findTreeDirections - -decompress :: forall a. (Ord a, Integral a, B.FiniteBits a) => Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString -decompress Nothing = Just [] -decompress (Just (treeDirs, tree)) = BS.concat . map toByteString <$> decompress' treeDirs - where - decompress' :: [TreeDir] -> Maybe [a] - decompress' [] = Just [] - decompress' xs = case nextLeaf xs tree of - Nothing -> Nothing - Just (x, remainingDirs) -> (x :) <$> decompress' remainingDirs - - nextLeaf :: [TreeDir] -> HuffmanTree a -> Maybe (a, [TreeDir]) - nextLeaf xs (Leaf a) = Just (a, xs) - nextLeaf [] _ = Nothing - nextLeaf (L : xs) (Node {..}) = nextLeaf xs left - nextLeaf (R : xs) (Node {..}) = nextLeaf xs right - -compress :: forall a. (Ord a, Integral a, B.FiniteBits a) => BS.ByteString -> Maybe ([TreeDir], HuffmanTree a) -compress bs = - liftA2 (,) treeDirections mergedHuffmanTrees - where - treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString - - mergedHuffmanTrees = - mergeHuffmanTrees - . PQ.fromList - . map (uncurry (flip (,)) . Bi.first Leaf) - . counts - $ dividedByteString - - treeDirMap :: M.Map a [TreeDir] - treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees - - dividedByteString = toBitsList bs - -testCompression :: forall a. (Ord a, Eq a, Integral a, B.FiniteBits a, C.Serialize a) => D.Proxy a -> BS.ByteString -> Bool -testCompression _ bs = - ((Right . Just $ bs) ==) - . Bi.second (decompress :: Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString) - -- . D.traceShowWith (Bi.second (fmap fst)) - . (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a))) - . encodeCompressed - -- . D.traceShowWith (fmap fst) - . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a)) - $ bs - -encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString -encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs) - where - encodeTreeDirs = cloneToByteStringWithLen . V.fromList . map (BV.Bit . (== R)) - -cloneToByteStringWithLen :: V.Vector BV.Bit -> (BS.ByteString, Int) -cloneToByteStringWithLen vec = (BV.cloneToByteString vec, V.length vec) - -cloneFromByteStringWithLen :: (BS.ByteString, Int) -> V.Vector BV.Bit -cloneFromByteStringWithLen (bs, len) = V.take len . BV.cloneFromByteString $ bs - -decodeTreeDirs :: (BS.ByteString, Int) -> [TreeDir] -decodeTreeDirs = map (\x -> if BV.unBit x then R else L) . V.toList . cloneFromByteStringWithLen - -decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBits a, C.Serialize a) => BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a)) -decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode - -mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a) -mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of - Nothing -> Nothing - Just (size, (x, Nothing)) -> Just x - Just (size, (x, Just (size1, (x', rest)))) -> mergeHuffmanTrees $ PQ.insert (size + size1) (Node x x') rest - -counts :: (Ord a) => [a] -> [(a, Int)] -counts = M.toList . F.foldl' combiningInsert M.empty - where - combiningInsert m key = M.insertWith (+) key 1 m - -divideByteString :: Int -> BS.ByteString -> [BS.ByteString] -divideByteString n [] = [] -divideByteString n bs = x : divideByteString n xs - where - (x, xs) = BS.splitAt n bs - -toBitsList :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> [a] -toBitsList bs = case finiteBitUncons bs of - Nothing -> [] - (Just (x, xs)) -> x : (toBitsList xs) - -toByteString :: forall a. (Integral a, B.FiniteBits a) => a -> BS.ByteString -toByteString n = BS.pack . take numBytes . map (fromIntegral . (n `B.rotateL`)) $ [8, 16 ..] - where - numBytes = numBytesIn (D.Proxy :: D.Proxy a) - -finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString) -finiteBitUncons [] = Nothing -finiteBitUncons bs = - Just - . (,rest) - . F.foldl' (.|.) 0 - . zipWith (flip B.rotateR) [8, 16 ..] - . map (fromIntegral :: Word8 -> a) - . BS.unpack - $ takenBytes - where - takenBytes :: BS.ByteString - rest :: BS.ByteString - (takenBytes, rest) = BS.splitAt (numBytesIn (D.Proxy :: D.Proxy a)) bs - -numBytesIn :: forall a. (B.FiniteBits a) => D.Proxy a -> Int -numBytesIn _ = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a) - -compressionRatioFor - :: forall a - . (Integral a, B.FiniteBits a, Ord a, C.Serialize a) - => D.Proxy a - -> BS.ByteString - -> Double -compressionRatioFor proxy bs = - (/ (fromIntegral . BS.length $ bs)) - . fromIntegral - . BS.length - . encodeCompressed - . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a)) - $ bs - -applyCompressionOptions :: CompressOrDecompress -> BS.ByteString -> BS.ByteString -applyCompressionOptions Compress f = encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word8)) $ f -applyCompressionOptions Decompress f = case Bi.second decompress - . (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree Word8))) - $ f of - -- TODO: write errors to stderr - (Left _) -> [] - (Right Nothing) -> [] - (Right (Just bs)) -> bs +handleError (Right (Just bs)) = bs +handleError _ = [] main :: IO () main = do - compresionOrDecompression :: CompressOrDecompress <- O.getRecord "compression/decompression" + -- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy" f <- BS.getContents - BS.putStr . applyCompressionOptions compresionOrDecompression $ f + BS.putStr . C.encode . A.compress . (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f + -- let f = "hello tehre" + -- f <- BS.readFile "pg64317.txt" + -- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f + -- print $ BS.length . C.encode $ compressed + -- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word8)) $ f + -- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word16)) $ f + -- BS.writeFile "outin.txt" decompressed + -- print (decompressed, f) + -- print $ BS.length decompressed + -- print $ BS.length f + -- print (decompressed == f) + -- BS.putStr . applyCompressionOptions (P.Proxy :: P.Proxy Word16) cliOpts $ f