add arithmetic coding

This commit is contained in:
Jack Wines 2024-12-21 12:28:00 -05:00
parent 6d00525334
commit 791fff6107
Signed by: Jack
SSH key fingerprint: SHA256:AaP2Hr/e3mEjeY+s9XJmQqAesqEms8ENRhwRkpO0WUk
11 changed files with 667 additions and 194 deletions

1
.gitignore vendored
View file

@ -1,4 +1,5 @@
/dist-newstyle/
/haddocks/
/result
/.direnv/
**/.DS_Store

View file

@ -43,6 +43,7 @@ executable compress
src
build-depends:
base,
basement,
bitvec,
bytestring,
cereal,
@ -50,12 +51,20 @@ executable compress
text,
uuid,
optparse-generic,
vector
vector,
nonempty-containers,
primes
default-language:
GHC2021
other-modules:
Data.PQueue
-- ghc-options:
-- -fprof-auto
-- -fprof-late
-- "-with-rtsopts=-p -hc"
Data.FiniteBit
Compress.Huffman
Compress.PrefixTree
Data.HuffmanTree
Compress.Arithmetic
ghc-options:
-threaded
-fprof-auto
-fprof-late
"-with-rtsopts=-p -hc"

42
flake.lock generated
View file

@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1712014858,
"narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=",
"lastModified": 1733312601,
"narHash": "sha256-4pDvzqnegAfRkPwO3wmwBhVi/Sye1mzps0zHWYnP88c=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "9126214d0a59633752a136528f5f3b9aa8565b7d",
"rev": "205b12d8b7cd4802fbcb8e8ef6a0f1408781a4f9",
"type": "github"
},
"original": {
@ -20,11 +20,11 @@
},
"haskell-flake": {
"locked": {
"lastModified": 1713084600,
"narHash": "sha256-qL7LV2MtwJ+1Xasg1TjSUmoE7yrRuXPqxpPlKjLE0SE=",
"lastModified": 1734464164,
"narHash": "sha256-5JCCyrgy7IMnipyYMQzIAXncGt2XVlW1aK71A+FTXDs=",
"owner": "srid",
"repo": "haskell-flake",
"rev": "847292fc793a5c15c873e52e7751ee4267ef32a0",
"rev": "e280b39efdd72b6a5bdaa982b67f150c819be642",
"type": "github"
},
"original": {
@ -35,11 +35,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1712963716,
"narHash": "sha256-WKm9CvgCldeIVvRz87iOMi8CFVB1apJlkUT4GGvA0iM=",
"lastModified": 1734424634,
"narHash": "sha256-cHar1vqHOOyC7f1+tVycPoWTfKIaqkoe1Q6TnKzuti4=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "cfd6b5fc90b15709b780a5a1619695a88505a176",
"rev": "d3c42f187194c26d9f0309a8ecc469d6c878ce33",
"type": "github"
},
"original": {
@ -51,20 +51,14 @@
},
"nixpkgs-lib": {
"locked": {
"dir": "lib",
"lastModified": 1711703276,
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
"type": "github"
"lastModified": 1733096140,
"narHash": "sha256-1qRH7uAUsyQI7R1Uwl4T+XvdNv778H0Nb5njNrqvylY=",
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz"
},
"original": {
"dir": "lib",
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz"
}
},
"root": {
@ -98,11 +92,11 @@
]
},
"locked": {
"lastModified": 1711963903,
"narHash": "sha256-N3QDhoaX+paWXHbEXZapqd1r95mdshxToGowtjtYkGI=",
"lastModified": 1733761991,
"narHash": "sha256-s4DalCDepD22jtKL5Nw6f4LP5UwoMcPzPZgHWjAfqbQ=",
"owner": "numtide",
"repo": "treefmt-nix",
"rev": "49dc4a92b02b8e68798abd99184f228243b6e3ac",
"rev": "0ce9d149d99bc383d1f2d85f31f6ebd146e46085",
"type": "github"
},
"original": {

View file

@ -22,7 +22,7 @@
# See https://github.com/srid/haskell-flake/blob/master/example/flake.nix
haskellProjects.default = {
# The base package set (this value is the default)
# basePackages = pkgs.haskellPackages;
basePackages = pkgs.haskell.packages.ghc96;
# Packages to add on top of `basePackages`
packages = {
@ -34,6 +34,13 @@
# Add your package overrides here
settings = {
uuid ={
jailbreak = true;
};
# hlint = {
# jailbreak = true;
# };
# barbies-th = {
# broken = false;
# jailbreak = true;
@ -57,7 +64,7 @@
programs.ormolu.enable = true;
programs.nixpkgs-fmt.enable = true;
programs.cabal-fmt.enable = true;
programs.hlint.enable = true;
programs.hlint.enable = false;
# We use fourmolu
programs.ormolu.package = pkgs.haskellPackages.fourmolu;

149
src/Compress/Arithmetic.hs Normal file
View file

@ -0,0 +1,149 @@
-- | https://en.wikipedia.org/wiki/Arithmetic_coding
module Compress.Arithmetic where
import Basement.Bits qualified as B
import Basement.Compat.Base (Word16, Word32, Word64, Word8)
import Basement.Compat.Bifunctor qualified as Bi
import Control.Arrow qualified as Ar
import Data.ByteString qualified as By
import Data.FiniteBit qualified as Fi
import Data.Foldable as F
import Data.List (genericLength)
import Data.List qualified as L
import Data.Map.Strict qualified as M
import Data.Maybe qualified as My
import Data.Numbers.Primes
import Data.Ord
import Data.Ratio
import Data.Serialize qualified as C
import GHC.Generics (Generic)
import GHC.Natural (Natural)
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
-- shortestNumberBetween :: Ratio Integer -> Ratio Integer -> Maybe (Ratio Integer)
-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums
data WordMarkovStats = WordMarkovStats
{ location :: Ratio Integer,
size :: Ratio Integer
}
deriving (Show, Eq, Ord, Generic, C.Serialize)
toRing :: forall a k. (Ord k, Integral a, Bounded a) => M.Map k a -> M.Map k WordMarkovStats
toRing m = M.fromList . zip (map fst asList) $ zipWith WordMarkovStats (scanl (+) 0 . map snd $ asList) (map snd asList)
where
asList = M.toList $ m'
sum' =
sum
. map fromIntegral
. M.elems
$ m
m' = M.map ((% sum') . fromIntegral) m
maxBound' :: Integer
maxBound' = fromIntegral (maxBound :: a)
twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k (M.Map k b)
twoByteMarkov xs =
M.map sizeAsFraction
. M.fromListWith (M.unionWith (+))
. zip xs
. map (`M.singleton` (1 :: Integer))
. tail
$ xs
where
sizeAsFraction m = M.map (max 1 . floor . fromRational . (* maxBound') . (% sum')) m
where
sum' = sum . M.elems $ m
toInteger :: (Integral a) => a -> Integer
toInteger = fromIntegral
maxBound' = fromIntegral (maxBound :: b)
data Compressed a = Compressed
{ markovs :: M.Map a (M.Map a (Word8)),
location :: Ratio Integer,
start :: a,
length :: Int
}
deriving (Eq, Ord, Show, Generic, C.Serialize)
decompress ::
forall a.
(Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) =>
Compressed a ->
[a]
decompress (Compressed {..}) = take length $ map snd . L.iterate decompress' $ (location, start)
where
decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
decompress' (loc, prev) = ((loc - ansLoc) / newSize, newVal)
where
(ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
sanityCheck = map (sum . M.elems) . M.elems
compress ::
forall a.
(Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) =>
[a] ->
Compressed a
compress toCompress = Compressed twoByteMarkovs (shortestLocation endStats) (head toCompress) (genericLength toCompress)
where
twoByteMarkovs = twoByteMarkov toCompress
rings = M.map toRing twoByteMarkovs
pairs = zip toCompress . tail $ toCompress
shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size)
endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
addWordMarkovStats
(WordMarkovStats {location = prevLoc, size = prevSize})
(WordMarkovStats {location = nextLoc, size = nextSize}) =
WordMarkovStats
{ location = prevLoc + (prevSize * nextLoc),
size = prevSize * nextSize
}
statsFor (x0, x1) = (rings M.! x0) M.! x1
pyramidFold f = pyramid
where
pyramid [x] = x
pyramid xs = pyramid $ pyramidFold' xs
pyramidFold' [x] = [x]
pyramidFold' (x0 : x1 : []) = [f x0 x1]
pyramidFold' (x0 : x1 : xs) = (f x0 x1) : (pyramidFold' xs)
-- borrowed and slightly changed from Data.Ratio source
simplestBetween :: Rational -> Rational -> Rational
simplestBetween x y
| x == y = x
| x > 0 = simplestBetween' n d n' d'
| otherwise = 0 % 1
where
n = numerator x
d = denominator x
n' = numerator y
d' = denominator y
simplestBetween' n d n' d' -- assumes 0 < n%d < n'%d'
| r == 0 = q % 1
| q /= q' = (q + 1) % 1
| otherwise = (q * n'' + d'') % n''
where
(q, r) = quotRem n d
(q', r') = quotRem n' d'
nd'' = simplestBetween' d' r' d r
n'' = numerator nd''
d'' = denominator nd''

125
src/Compress/Huffman.hs Normal file
View file

@ -0,0 +1,125 @@
module Compress.Huffman where
import Data.Bifunctor qualified as Bi
import Data.FiniteBit
import Data.Bit (cloneToByteString)
import Data.Bit qualified as B
import Data.Bit qualified as BV
import Data.ByteString (fromFilePath)
import Data.ByteString qualified as BS
import Data.Data qualified as D
import Data.Foldable qualified as F
import Data.IntMap.Strict qualified as IM
import qualified Data.Proxy as D
import qualified Basement.From as F
import Data.Map.Strict qualified as M
import Basement.Bits as B
import Data.Maybe (fromMaybe)
import Data.Maybe qualified as My
import Data.PQueue qualified as PQ
import Data.Serialize qualified as C
import Data.Text qualified as T
import Data.Text.IO qualified as TIO
import Data.Vector.Unboxed qualified as V
import Data.Word
import Debug.Trace qualified as D
import GHC.Generics (Generic)
import Options.Generic qualified as O
import System.Environment qualified as SE
import Data.HuffmanTree
decompress
:: forall a
. (Ord a, Integral a, B.FiniteBitsOps a)
=> (TreeDirs, HuffmanTree a)
-> Maybe BS.ByteString
decompress (TreeDirs treeDirs, tree) = BS.concat . map toByteString <$> decompress' treeDirs
where
decompress' :: [TreeDir] -> Maybe [a]
decompress' [] = Just []
decompress' xs = case nextLeaf xs tree of
Nothing -> Nothing
Just (x, remainingDirs) -> (x :) <$> decompress' remainingDirs
nextLeaf :: [TreeDir] -> HuffmanTree a -> Maybe (a, [TreeDir])
nextLeaf xs (Leaf a) = Just (a, xs)
nextLeaf [] _ = Nothing
nextLeaf (L : xs) (Node {..}) = nextLeaf xs left
nextLeaf (R : xs) (Node {..}) = nextLeaf xs right
compress
:: forall a
. (Ord a, Integral a, B.FiniteBitsOps a, B.BitOps a)
=> BS.ByteString
-> Maybe (TreeDirs, HuffmanTree a)
compress bs =
liftA2 (,) (TreeDirs <$> treeDirections) mergedHuffmanTrees
where
treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString
mergedHuffmanTrees =
mergeHuffmanTrees
. PQ.fromList
. map (uncurry (flip (,)) . Bi.first Leaf)
. counts
$ dividedByteString
treeDirMap :: M.Map a [TreeDir]
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
dividedByteString = toWordsList bs
-- testCompression
-- :: forall a
-- . (Ord a, Eq a, Integral a, B.FiniteBitsOps a, B.BitOps a, C.Serialize a)
-- => D.Proxy a
-- -> BS.ByteString
-- -> Bool
-- testCompression _ bs =
-- ((Right . Just $ bs) ==)
-- . Bi.second (decompress :: Maybe (TreeDirs, HuffmanTree a) -> Maybe BS.ByteString)
-- -- . D.traceShowWith (Bi.second (fmap fst))
-- . (decodeCompressed :: BS.ByteString -> Either String (Maybe (TreeDirs, HuffmanTree a)))
-- . encodeCompressed
-- -- . D.traceShowWith (fmap fst)
-- . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a))
-- $ bs
-- cloneToByteStringWithLen :: V.Vector BV.Bit -> (BS.ByteString, Int)
-- cloneToByteStringWithLen vec = (BV.cloneToByteString vec, V.length vec)
-- cloneFromByteStringWithLen :: (BS.ByteString, Int) -> V.Vector BV.Bit
-- cloneFromByteStringWithLen (bs, len) = V.take len . BV.cloneFromByteString $ bs
-- decodeTreeDirs :: (BS.ByteString, Int) -> [TreeDir]
-- decodeTreeDirs = map (\x -> if BV.unBit x then R else L) . V.toList . cloneFromByteStringWithLen
-- decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBitsOps a, C.Serialize a) => BS.ByteString -> Either String (Maybe (TreeDirs, HuffmanTree a))
-- decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode
counts :: (Ord a) => [a] -> [(a, Int)]
counts = M.toList . F.foldl' combiningInsert M.empty
where
combiningInsert m key = M.insertWith (+) key 1 m
divideByteString :: Int -> BS.ByteString -> [BS.ByteString]
divideByteString n [] = []
divideByteString n bs = x : divideByteString n xs
where
(x, xs) = BS.splitAt n bs
compressionRatioFor
:: forall a
. (Integral a, B.FiniteBitsOps a, B.BitOps a, Ord a, C.Serialize a)
=> D.Proxy a
-> BS.ByteString
-> Double
compressionRatioFor proxy bs =
(/ (fromIntegral . BS.length $ bs))
. fromIntegral
. BS.length
. C.encode
. (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a))
$ bs

188
src/Compress/PrefixTree.hs Normal file
View file

@ -0,0 +1,188 @@
module Compress.PrefixTree where
import Basement.Bits qualified as B
import qualified Basement.From as F
import Compress.Huffman qualified as H
import Control.Applicative qualified as A
import Data.Bifunctor qualified as Bi
import Data.ByteString qualified as BS
import Data.Foldable qualified as F
import Data.HuffmanTree as HT
import Data.List qualified as L
import Data.List.NonEmpty qualified as NE
import Data.Map.Strict qualified as M
import Data.Maybe qualified as My
import Data.Ord qualified as O
import Data.PQueue qualified as PQ
import Debug.Trace qualified as D
import Debug.Trace qualified as T
import Basement.Bits (FiniteBitsOps(numberOfBits))
import GHC.Generics
import qualified Data.Serialize as C
import Data.FiniteBit
data Tree a = (Ord a) =>
Tree
{ children :: M.Map a (Tree a)
}
newtype HuffmanPrefixTree a b = HuffmanPrefixTree
{ inner :: M.Map a (HuffmanTree b)
} deriving (Eq, Ord, Show, Generic, C.Serialize)
finiteBitTupleUncons ::
forall a b.
(Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) =>
BS.ByteString ->
Maybe ((a, b), BS.ByteString)
finiteBitTupleUncons bs = case finiteBitUncons bs of
Just (a, bs') -> case finiteBitUncons bs' of
Just (b, _) -> Just ((a, b), bs')
_ -> Nothing
_ -> Nothing
fromByteString ::
forall a b.
(Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) =>
BS.ByteString ->
[(a, b)]
fromByteString bs = case finiteBitTupleUncons bs of
Just ((a, b), bs') -> (a, b) : fromByteString bs'
Nothing -> []
toHuffmanTree ::
forall a b.
(Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) =>
M.Map (a, b) Word ->
HuffmanPrefixTree a b
toHuffmanTree =
HuffmanPrefixTree
. M.mapMaybe HT.fromList
. M.fromListWith (++)
. map (\((a, b), count) -> (a, [(fromIntegral count, b)]))
. M.assocs
decompress ::
forall a .
(Integral a, B.FiniteBitsOps a, B.BitOps a) =>
(TreeDirs, HuffmanPrefixTree a a, a)
-> Maybe BS.ByteString
decompress (TreeDirs treeDirs'', HuffmanPrefixTree prefixTree, initial') = BS.concat . map toByteString . (initial' :) <$> decompress' treeDirs'' initial'
where
decompress' :: [TreeDir] -> a -> Maybe [a]
decompress' treeDirs initial = case HT.lookup (prefixTree M.! initial) treeDirs of
Nothing -> Nothing
Just (ans, []) -> Just [ans]
Just (ans, treeDirs') -> (ans :) <$> decompress' treeDirs' ans
compress ::
forall a b.
(Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) =>
BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a b, a)
compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> initial
where
tree :: HuffmanPrefixTree a b
tree = toHuffmanTree . nGramCounts $ bs
treeDirMap :: M.Map a (M.Map b [TreeDir])
treeDirMap = M.map HT.findTreeDirections . Compress.PrefixTree.inner $ tree
initial :: Maybe a
initial = fst <$> finiteBitUncons bs
asFiniteBitPairs :: [(a,b)]
asFiniteBitPairs = fromByteString bs
treeDirsFor :: (a, b) -> [TreeDir]
treeDirsFor (a, b) = (treeDirMap M.! a) M.! b
-- | all (M.null . children) . M.elems . children $ tree =
-- fmap End
-- . HT.fromList
-- . map (\x -> (prefixCounts x, x))
-- . M.keys
-- . children
-- $ tree
-- | otherwise =
-- Just
-- . Layer
-- . M.mapMaybeWithKey (\key val -> toHuffmanTree' (key : soFar) val)
-- . children
-- $ tree
-- where
-- prefixCounts :: a -> Int
-- prefixCounts x =
-- fromIntegral
-- . sum
-- . M.elems
-- . M.filterWithKey (\key val -> L.isPrefixOf (reverse . (x :) $ soFar) key)
-- $ nGrams
-- toHuffmanTree :: Tree a -> p1 -> HuffmanTree a
-- toHuffmanTree :: forall a . Tree a -> M.Map [a] Word -> HuffmanTree [a]
-- toHuffmanTree (Tree {..}) nGrams soFar | M.size children == 1 = Leaf . map (reverse . (: soFar)) . M.keys $ children
-- toHuffmanTree (Tree {..}) nGrams soFar = Leaf . map (reverse . (: soFar)) . M.keys $ children
-- where
-- sorted = L.sortBy (prefixCounts . fst) . M.toList $ children
nGramCounts ::
forall a b.
(Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) =>
BS.ByteString ->
M.Map (a, b) Word
nGramCounts =
M.fromListWith (+)
. map (,1)
. My.mapMaybe (My.listToMaybe . fromByteString)
. takeWhile ((== len) . BS.length)
. map (BS.take len)
. BS.tails
where
len = (`div` 8) . F.from $ numberOfBits (0 :: a) + numberOfBits (0 :: b)
empty :: (Ord a) => Tree a
empty = Tree M.empty
singleton :: (Ord a) => a -> Tree a
singleton x = Tree $ M.singleton x empty
fromSingleList :: (Ord a) => [a] -> Tree a
fromSingleList [] = empty
fromSingleList (x : xs) = Tree . M.singleton x . fromSingleList $ xs
fromList :: (Ord a) => [[a]] -> Tree a
fromList = F.foldl' merge empty . map fromSingleList
-- insert :: Ord a => Tree a -> [a] -> Tree a
-- insert (Tree {..}) (x:xs) =
merge :: Tree a -> Tree a -> Tree a
merge (Tree children0) (Tree children1) = Tree $ M.unionWith merge children0 children1
-- deriving instance Eq (Tree a)
-- deriving instance Ord (Tree a)
-- deriving instance (Show a) => Show (Tree a)
-- empty :: (Ord a) => Tree a
-- empty = Tree M.empty
-- fromList :: (Ord a, F.Foldable t) => t [a] -> Tree a
-- fromList = F.foldl' insert empty
-- insert :: Tree a -> [a] -> Tree a
-- insert (Tree {..}) [] = Tree M.empty
-- insert (Tree {..}) (x : xs) =
-- Tree
-- . flip (M.insert x) children
-- . flip insert xs
-- . My.fromMaybe empty
-- . M.lookup x
-- $ children
-- lookup :: (Ord a) => [a] -> Tree a -> Bool
-- lookup [] = const True
-- lookup (x : xs) = maybe False (Compress.PrefixTree.lookup xs) . M.lookup x . children

45
src/Data/FiniteBit.hs Normal file
View file

@ -0,0 +1,45 @@
module Data.FiniteBit where
import Data.Bit (cloneToByteString)
import Data.Bit qualified as B
import Data.Bit qualified as BV
import Data.ByteString qualified as BS
import qualified Data.Proxy as D
import qualified Basement.From as F
import Data.Foldable qualified as F
import Data.Word
import Basement.Bits as B
numBytesIn :: forall a. (B.FiniteBitsOps a, Integral a) => D.Proxy a -> Int
numBytesIn _ = (`div` 8) . F.from . B.numberOfBits $ (0 :: a)
toWordsList :: forall a. (Integral a, B.FiniteBitsOps a, B.BitOps a) => BS.ByteString -> [a]
toWordsList bs = case finiteBitUncons bs of
Nothing -> []
(Just (x, xs)) -> x : toWordsList xs
toByteString :: forall a. (Integral a, B.FiniteBitsOps a) => a -> BS.ByteString
toByteString n = BS.pack . take numBytes . map (fromIntegral . (n `B.rotateL`)) $ [8, 16 ..]
where
numBytes = numBytesIn (D.Proxy :: D.Proxy a)
finiteBitUncons
:: forall a
. (Integral a, B.FiniteBitsOps a, B.BitOps a)
=> BS.ByteString
-> Maybe (a, BS.ByteString)
finiteBitUncons [] = Nothing
finiteBitUncons bs =
Just
. (,rest)
. F.foldl' (.|.) 0
. zipWith (flip B.rotateR) [8, 16 ..]
. map (fromIntegral :: Word8 -> a)
. BS.unpack
$ takenBytes
where
takenBytes :: BS.ByteString
rest :: BS.ByteString
(takenBytes, rest) = BS.splitAt (numBytesIn (D.Proxy :: D.Proxy a)) bs

67
src/Data/HuffmanTree.hs Normal file
View file

@ -0,0 +1,67 @@
module Data.HuffmanTree where
import Data.Bifunctor qualified as Bi
import Data.Map.Strict qualified as M
import Data.PQueue qualified as PQ
import Data.Serialize qualified as C
import GHC.Generics (Generic)
import Data.Vector.Unboxed qualified as V
import Data.Bit qualified as B
import Data.Bit qualified as BV
import Data.ByteString qualified as BS
data HuffmanTree a
= Leaf a
| Node
{ left :: HuffmanTree a,
right :: HuffmanTree a
}
deriving (Eq, Ord, Show, Generic, C.Serialize, Functor)
-- here so we can define our own Serialize instance
newtype TreeDirs = TreeDirs {
inner :: [TreeDir]
} deriving (Eq, Ord, Show)
data TreeDir = L | R deriving (Eq, Ord, Show)
instance C.Serialize TreeDirs where
put :: C.Putter TreeDirs
put = C.put . Bi.first BV.cloneToByteString . (\x -> (x, V.length x)) . V.fromList . map (BV.Bit . (== R)) . (inner :: TreeDirs -> [TreeDir])
get :: C.Get TreeDirs
get = do
(bs, len) <- C.get
pure . TreeDirs . map (\x -> if BV.unBit x then R else L) . V.toList . V.take len . BV.cloneFromByteString $ bs
lookup ::
forall a.
(Ord a) =>
HuffmanTree a ->
[TreeDir] ->
Maybe (a, [TreeDir])
lookup (Node {..}) (L : xs) = Data.HuffmanTree.lookup left xs
lookup (Node {..}) (R : xs) = Data.HuffmanTree.lookup right xs
lookup (Leaf a) xs = Just (a, xs)
lookup _ [] = Nothing
findTreeDirections ::
forall a.
(Ord a) =>
HuffmanTree a ->
M.Map a [TreeDir]
findTreeDirections (Leaf a) = M.singleton a []
findTreeDirections (Node {..}) = M.union (rec' L left) (rec' R right)
where
rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir]
rec' dir = M.map (dir :) . findTreeDirections
mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a)
mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of
Nothing -> Nothing
Just (size, (x, Nothing)) -> Just x
Just (size, (x, Just (size1, (x', rest)))) -> mergeHuffmanTrees $ PQ.insert (size + size1) (Node x x') rest
fromList :: [(Int, a)] -> Maybe (HuffmanTree a)
fromList = mergeHuffmanTrees . PQ.fromList . map (Bi.second Leaf)

View file

@ -16,7 +16,10 @@ fromList = PQueue . IM.fromListWith NE.append . map (Bi.second NE.singleton)
singleton :: Int -> a -> PQueue a
singleton key a = PQueue $ IM.singleton key [a]
abstractView :: (IM.IntMap (NE.NonEmpty a) -> Maybe (IM.Key, NE.NonEmpty a)) -> PQueue a -> Maybe (IM.Key, (a, PQueue a))
abstractView
:: (IM.IntMap (NE.NonEmpty a) -> Maybe (IM.Key, NE.NonEmpty a))
-> PQueue a
-> Maybe (IM.Key, (a, PQueue a))
abstractView f (PQueue m) = case f m of
Nothing -> Nothing
(Just (key, x NE.:| (x' : xs))) -> Just (key, (x, PQueue $ IM.insert key (x' NE.:| xs) m))

View file

@ -1,182 +1,67 @@
module Main where
import Basement.Bits qualified as B
import Compress.Huffman
import Compress.Huffman qualified as HT
import Compress.PrefixTree (HuffmanPrefixTree (HuffmanPrefixTree))
import Compress.PrefixTree qualified as PT
import Compress.Arithmetic qualified as A
import Data.Bifunctor qualified as Bi
import Data.Bit (cloneToByteString)
import Data.Bit qualified as B
import Data.Bit qualified as BV
import Data.Bits (Bits ((.|.)))
import Data.Bits qualified as B
import Data.ByteString (fromFilePath)
import Data.ByteString qualified as BS
import Data.Data qualified as D
import Data.Foldable qualified as F
import Data.IntMap.Strict qualified as IM
import Data.Map.Strict qualified as M
import Data.Maybe (fromMaybe)
import Data.Maybe qualified as My
import Data.PQueue qualified as PQ
import Data.HuffmanTree
import Data.Proxy qualified as P
import Data.Serialize qualified as C
import Data.Text qualified as T
import Data.Text.IO qualified as TIO
import Data.Vector.Unboxed qualified as V
import Data.Word
import Debug.Trace qualified as D
import GHC.Generics (Generic)
import Options.Generic qualified as O
import System.Environment qualified as SE
import qualified Data.FiniteBit as FB
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic)
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
instance O.ParseRecord CompressOrDecompress
data CompressionStrategy = Huffman | MarkovHuffman deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
data HuffmanTree a
= Leaf a
| Node
{left :: HuffmanTree a, right :: HuffmanTree a}
deriving (Eq, Ord, Show, Generic, C.Serialize)
data CLIOpts = CLIOpts
{ task :: CompressOrDecompress,
strategy :: CompressionStrategy
}
deriving (Show, Generic, O.ParseRecord)
data TreeDir = L | R deriving (Eq, Ord, Show, Generic, C.Serialize)
applyCompressionOptions ::
forall a.
(Integral a, B.BitOps a, B.FiniteBitsOps a, Ord a, C.Serialize a) =>
P.Proxy a ->
CLIOpts ->
BS.ByteString ->
BS.ByteString
applyCompressionOptions _ (CLIOpts Compress Huffman) f =
C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) $ f
applyCompressionOptions _ (CLIOpts Compress MarkovHuffman) f =
C.encode . (PT.compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a a, a)) $ f
applyCompressionOptions _ (CLIOpts Decompress Huffman) f =
handleError $ Bi.second decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanTree a)) $ f
applyCompressionOptions _ (CLIOpts Decompress MarkovHuffman) f =
handleError $ Bi.second PT.decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanPrefixTree a a, a)) $ f
findTreeDirections :: forall a. (Ord a) => HuffmanTree a -> M.Map a [TreeDir]
findTreeDirections (Leaf a) = M.singleton a []
findTreeDirections (Node {..}) = M.union (rec' L left) (rec' R right)
where
rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir]
rec' dir = M.map (dir :) . findTreeDirections
decompress :: forall a. (Ord a, Integral a, B.FiniteBits a) => Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString
decompress Nothing = Just []
decompress (Just (treeDirs, tree)) = BS.concat . map toByteString <$> decompress' treeDirs
where
decompress' :: [TreeDir] -> Maybe [a]
decompress' [] = Just []
decompress' xs = case nextLeaf xs tree of
Nothing -> Nothing
Just (x, remainingDirs) -> (x :) <$> decompress' remainingDirs
nextLeaf :: [TreeDir] -> HuffmanTree a -> Maybe (a, [TreeDir])
nextLeaf xs (Leaf a) = Just (a, xs)
nextLeaf [] _ = Nothing
nextLeaf (L : xs) (Node {..}) = nextLeaf xs left
nextLeaf (R : xs) (Node {..}) = nextLeaf xs right
compress :: forall a. (Ord a, Integral a, B.FiniteBits a) => BS.ByteString -> Maybe ([TreeDir], HuffmanTree a)
compress bs =
liftA2 (,) treeDirections mergedHuffmanTrees
where
treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString
mergedHuffmanTrees =
mergeHuffmanTrees
. PQ.fromList
. map (uncurry (flip (,)) . Bi.first Leaf)
. counts
$ dividedByteString
treeDirMap :: M.Map a [TreeDir]
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
dividedByteString = toBitsList bs
testCompression :: forall a. (Ord a, Eq a, Integral a, B.FiniteBits a, C.Serialize a) => D.Proxy a -> BS.ByteString -> Bool
testCompression _ bs =
((Right . Just $ bs) ==)
. Bi.second (decompress :: Maybe ([TreeDir], HuffmanTree a) -> Maybe BS.ByteString)
-- . D.traceShowWith (Bi.second (fmap fst))
. (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a)))
. encodeCompressed
-- . D.traceShowWith (fmap fst)
. (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a))
$ bs
encodeCompressed :: (C.Serialize a) => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString
encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs)
where
encodeTreeDirs = cloneToByteStringWithLen . V.fromList . map (BV.Bit . (== R))
cloneToByteStringWithLen :: V.Vector BV.Bit -> (BS.ByteString, Int)
cloneToByteStringWithLen vec = (BV.cloneToByteString vec, V.length vec)
cloneFromByteStringWithLen :: (BS.ByteString, Int) -> V.Vector BV.Bit
cloneFromByteStringWithLen (bs, len) = V.take len . BV.cloneFromByteString $ bs
decodeTreeDirs :: (BS.ByteString, Int) -> [TreeDir]
decodeTreeDirs = map (\x -> if BV.unBit x then R else L) . V.toList . cloneFromByteStringWithLen
decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBits a, C.Serialize a) => BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree a))
decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode
mergeHuffmanTrees :: PQ.PQueue (HuffmanTree a) -> Maybe (HuffmanTree a)
mergeHuffmanTrees queue = case (Bi.second . Bi.second $ PQ.minView) <$> PQ.minView queue of
Nothing -> Nothing
Just (size, (x, Nothing)) -> Just x
Just (size, (x, Just (size1, (x', rest)))) -> mergeHuffmanTrees $ PQ.insert (size + size1) (Node x x') rest
counts :: (Ord a) => [a] -> [(a, Int)]
counts = M.toList . F.foldl' combiningInsert M.empty
where
combiningInsert m key = M.insertWith (+) key 1 m
divideByteString :: Int -> BS.ByteString -> [BS.ByteString]
divideByteString n [] = []
divideByteString n bs = x : divideByteString n xs
where
(x, xs) = BS.splitAt n bs
toBitsList :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> [a]
toBitsList bs = case finiteBitUncons bs of
Nothing -> []
(Just (x, xs)) -> x : (toBitsList xs)
toByteString :: forall a. (Integral a, B.FiniteBits a) => a -> BS.ByteString
toByteString n = BS.pack . take numBytes . map (fromIntegral . (n `B.rotateL`)) $ [8, 16 ..]
where
numBytes = numBytesIn (D.Proxy :: D.Proxy a)
finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString)
finiteBitUncons [] = Nothing
finiteBitUncons bs =
Just
. (,rest)
. F.foldl' (.|.) 0
. zipWith (flip B.rotateR) [8, 16 ..]
. map (fromIntegral :: Word8 -> a)
. BS.unpack
$ takenBytes
where
takenBytes :: BS.ByteString
rest :: BS.ByteString
(takenBytes, rest) = BS.splitAt (numBytesIn (D.Proxy :: D.Proxy a)) bs
numBytesIn :: forall a. (B.FiniteBits a) => D.Proxy a -> Int
numBytesIn _ = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a)
compressionRatioFor
:: forall a
. (Integral a, B.FiniteBits a, Ord a, C.Serialize a)
=> D.Proxy a
-> BS.ByteString
-> Double
compressionRatioFor proxy bs =
(/ (fromIntegral . BS.length $ bs))
. fromIntegral
. BS.length
. encodeCompressed
. (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree a))
$ bs
applyCompressionOptions :: CompressOrDecompress -> BS.ByteString -> BS.ByteString
applyCompressionOptions Compress f = encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word8)) $ f
applyCompressionOptions Decompress f = case Bi.second decompress
. (decodeCompressed :: BS.ByteString -> Either String (Maybe ([TreeDir], HuffmanTree Word8)))
$ f of
-- TODO: write errors to stderr
(Left _) -> []
(Right Nothing) -> []
(Right (Just bs)) -> bs
handleError (Right (Just bs)) = bs
handleError _ = []
main :: IO ()
main = do
compresionOrDecompression :: CompressOrDecompress <- O.getRecord "compression/decompression"
-- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
f <- BS.getContents
BS.putStr . applyCompressionOptions compresionOrDecompression $ f
BS.putStr . C.encode . A.compress . (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
-- let f = "hello tehre"
-- f <- BS.readFile "pg64317.txt"
-- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f
-- print $ BS.length . C.encode $ compressed
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word8)) $ f
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word16)) $ f
-- BS.writeFile "outin.txt" decompressed
-- print (decompressed, f)
-- print $ BS.length decompressed
-- print $ BS.length f
-- print (decompressed == f)
-- BS.putStr . applyCompressionOptions (P.Proxy :: P.Proxy Word16) cliOpts $ f