diff --git a/compress.cabal b/compress.cabal index 37e2dd7..c6669f9 100644 --- a/compress.cabal +++ b/compress.cabal @@ -75,7 +75,6 @@ common deps Compress.BurrowsWheeler Compress.Huffman Compress.PrefixTree - Compress.WordMarkovStats Compress.LengthDistancePairs Compress.MoveToFrontTransform Data.Dirs diff --git a/flake.lock b/flake.lock index 559a1d9..a5adea5 100644 --- a/flake.lock +++ b/flake.lock @@ -37,11 +37,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1756770412, - "narHash": "sha256-+uWLQZccFHwqpGqr2Yt5VsW/PbeJVTn9Dk6SHWhNRPw=", + "lastModified": 1763759067, + "narHash": "sha256-LlLt2Jo/gMNYAwOgdRQBrsRoOz7BPRkzvNaI/fzXi2Q=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "4524271976b625a4a605beefd893f270620fd751", + "rev": "2cccadc7357c0ba201788ae99c4dfa90728ef5e0", "type": "github" }, "original": { @@ -67,11 +67,11 @@ }, "haskell-flake": { "locked": { - "lastModified": 1756607542, - "narHash": "sha256-+99fEAk0HwjYgIW2tEOs7ayBDxnU9NAM5E29ZxgyX40=", + "lastModified": 1763904090, + "narHash": "sha256-Mq99c4O6Ovx/JQgi6B6fnI9xR77nQ9+GcFTM2113nMI=", "owner": "srid", "repo": "haskell-flake", - "rev": "73e3891fb135c679a1c30fae4b101e5b41b8ca61", + "rev": "73c006f8e6531acab28bbde362ae8f13ee0ac72b", "type": "github" }, "original": { @@ -97,11 +97,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1756542300, - "narHash": "sha256-tlOn88coG5fzdyqz6R93SQL5Gpq+m/DsWpekNFhqPQk=", + "lastModified": 1764517877, + "narHash": "sha256-pp3uT4hHijIC8JUK5MEqeAWmParJrgBVzHLNfJDZxg4=", "owner": "nixos", "repo": "nixpkgs", - "rev": "d7600c775f877cd87b4f5a831c28aa94137377aa", + "rev": "2d293cbfa5a793b4c50d17c05ef9e385b90edf6c", "type": "github" }, "original": { @@ -113,11 +113,11 @@ }, "nixpkgs-lib": { "locked": { - "lastModified": 1754788789, - "narHash": "sha256-x2rJ+Ovzq0sCMpgfgGaaqgBSwY+LST+WbZ6TytnT9Rk=", + "lastModified": 1761765539, + "narHash": "sha256-b0yj6kfvO8ApcSE+QmA6mUfu8IYG6/uU28OFn4PaC8M=", "owner": "nix-community", "repo": "nixpkgs.lib", - "rev": "a73b9c743612e4244d865a2fdee11865283c04e6", + "rev": "719359f4562934ae99f5443f20aa06c2ffff91fc", "type": "github" }, "original": { @@ -162,11 +162,11 @@ ] }, "locked": { - "lastModified": 1756662192, - "narHash": "sha256-F1oFfV51AE259I85av+MAia221XwMHCOtZCMcZLK2Jk=", + "lastModified": 1762938485, + "narHash": "sha256-AlEObg0syDl+Spi4LsZIBrjw+snSVU4T8MOeuZJUJjM=", "owner": "numtide", "repo": "treefmt-nix", - "rev": "1aabc6c05ccbcbf4a635fb7a90400e44282f61c4", + "rev": "5b4ee75aeefd1e2d5a1cc43cf6ba65eba75e83e4", "type": "github" }, "original": { diff --git a/src/Compress/Arithmetic.hs b/src/Compress/Arithmetic.hs index 63a5dd3..9d1e087 100644 --- a/src/Compress/Arithmetic.hs +++ b/src/Compress/Arithmetic.hs @@ -2,131 +2,116 @@ -- module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized, Word4Map) where module Compress.Arithmetic where -import Data.Bits qualified as B -import Data.Word -import Data.Bifunctor qualified as Bi +import Codec.Winery (WineryRecord) +import Codec.Winery qualified as C import Control.Arrow qualified as Ar import Control.Monad.Par (NFData) import Control.Monad.Par qualified as P +import Data.Bifunctor qualified as Bi +import Data.Bit qualified as BV +import Data.Bit qualified as VB +import Data.Bits qualified as B +import Data.ByteString qualified as BS import Data.ByteString qualified as By +import Data.Dirs import Data.FiniteBit qualified as Fi -import qualified Data.Foldable as F +import Data.Foldable qualified as F import Data.List (genericLength) import Data.List qualified as L import Data.Map.Strict qualified as M import Data.Maybe qualified as My import Data.Ord +-- import qualified Debug.Trace as D +import Data.Proxy qualified as P import Data.Ratio -import Codec.Winery qualified as C +import Data.Vector qualified as V +import Data.Vector.Unboxed qualified as VU +import Data.Word +import Data.Word4 (CompressWord4List, Word4, fromCompressedWord4List, toCompressedWord4List) import GHC.Generics (Generic) import GHC.Natural (Natural) -import qualified Relude.Unsafe as US -import Data.Word4 (Word4, CompressWord4List, toCompressedWord4List, fromCompressedWord4List) -import qualified Data.ByteString as BS -import qualified Relude.Unsafe as U -import Codec.Winery (WineryRecord) -import Compress.BurrowsWheeler (toBurrowsWheeler) -import qualified Data.Bit as BV -import qualified Data.Vector as V -import qualified Data.Bit as VB -import qualified Data.Vector.Unboxed as VU -import Data.Dirs --- import qualified Debug.Trace as D -import qualified Data.Proxy as P -import Compress.WordMarkovStats as WM +import Relude.Unsafe qualified as U +import Relude.Unsafe qualified as US -- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes -- shortestNumberBetween :: Ratio Integer -> Ratio Integer -> Maybe (Ratio Integer) -- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums - toRing :: forall a k. (Show a, Show k, Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats where - wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized maxBound' :: Integer maxBound' = fromIntegral (maxBound :: a) - resize xs = map ( / sum' ) xs + resize xs = map (/ sum') xs where sum' = sum xs withBumpedZeroSized :: [Rational] - withBumpedZeroSized - = resize - . map (max minimumSize) - . resize - . map (fromIntegral . snd) - $ xs + withBumpedZeroSized = + resize + . map (max minimumSize) + . resize + . map (fromIntegral . snd) + $ xs where - -- | we have to round the 0s up to something - -- | we somewhat arbitrarily choose half of the smallest - -- | normally representable amount + -- \| we have to round the 0s up to something + -- \| we somewhat arbitrarily choose half of the smallest + -- \| normally representable amount minimumSize :: Rational minimumSize = 1 % (maxBound' * 2) - - f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum) where newFrac = currFrac * (1 - runningSum) -relativeCounts :: forall k . Ord k => [(k, Integer)] -> [(k, Rational)] +relativeCounts :: forall k. (Ord k) => [(k, Integer)] -> [(k, Rational)] relativeCounts m = map (Bi.second asFractionOfSum) m - where - asFractionOfSum = ( % sum') . fromIntegral + where + asFractionOfSum = (% sum') . fromIntegral - sum' = fromIntegral . F.sum . map snd $ m + sum' = fromIntegral . F.sum . map snd $ m - - -maxBound' :: forall a b . (Num b, Bounded a, Integral a) => P.Proxy a -> b +maxBound' :: forall a b. (Num b, Bounded a, Integral a) => P.Proxy a -> b maxBound' p = fromIntegral (maxBound :: a) -twoByteMarkov :: forall k . (Ord k) => [k] -> M.Map k [(k, Rational)] -twoByteMarkov xs = - M.map sizeAsFraction - . M.fromListWith (M.unionWith (+)) - . zip xs - . map (`M.singleton` (1 :: Integer)) - . US.tail +listToRelativeCounts :: forall k. (Ord k) => [k] -> [(k, Rational)] +listToRelativeCounts xs = + relativeCounts + . M.assocs + . M.fromListWith (+) + . map (,1) $ xs - where - - sizeAsFraction :: M.Map k Integer -> [(k, Rational)] - sizeAsFraction = relativeCounts . M.assocs data Chunk a = Chunk { location :: SerializedDirs, - start :: a, length :: Word64 } deriving (Eq, Ord, Show, Generic, P.NFData) - deriving C.Serialise via C.WineryVariant (Chunk a) + deriving (C.Serialise) via C.WineryVariant (Chunk a) -- maxBound = maxBound' (P.Proxy :: Proxy b) -discretizeFraction :: forall b . (Bounded b, Integral b) => Rational -> b +discretizeFraction :: forall b. (Bounded b, Integral b) => Rational -> b discretizeFraction = floor . min maxBound'' . fromRational . (* (succ $ maxBound' (P.Proxy :: P.Proxy b))) where maxBound'' = maxBound' (P.Proxy :: P.Proxy b) - type Word4MapSerialized a = M.Map a ([a], CompressWord4List) -type Word4Map a = M.Map a [(a, Word8)] +type Word4Map a = M.Map a [(a, Word8)] -- talkAboutCompressed (Compressed {..}) = ( -- ("markovs (serealized)", BS.length . C.serialiseOnly $ markovs), -- ("markovs (unserealized)", BS.length . C.serialiseOnly . unserealizeWord4Map $ markovs), - -- ("chunks", BS.length . C.serialiseOnly $ chunks)) +-- ("chunks", BS.length . C.serialiseOnly $ chunks)) -newtype Prediction = Prediction { - chances :: [(Word8, Rational)] - } +newtype Prediction = Prediction + { chances :: [(Word8, Rational)] + } class CompressionModel a where createModel :: [Word8] -> a @@ -136,7 +121,7 @@ class CompressionModel a where -- at some point write -- instance IsoSerialisable a b => C.Serialise b -class C.Serialise b => IsoSerialisable a b where +class (C.Serialise b) => IsoSerialisable a b where toSerialisable :: a -> b fromSerialisable :: b -> a @@ -145,14 +130,14 @@ class C.Serialise b => IsoSerialisable a b where -- deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData) -- deriving C.Serialise via C.WineryRecord BetterCompressed -data Compressed a mapImplementation = Compressed - { markovs :: mapImplementation, +data Compressed a = Compressed + { + -- needs a better name than markovs + frequencies :: [(a, Word8)], -- we'll paramaterize this at some point::168 chunks :: [Chunk a] } deriving (Eq, Ord, Show, Generic, NFData) -- , C.Serialise, P.NFData) - deriving C.Serialise via C.WineryVariant (Compressed a mapImplementation) - - + deriving (C.Serialise) via C.WineryVariant (Compressed a) -- unserealizeWord4Map :: forall a . Show a => Word4MapSerialized a -> Word4Map a -- unserealizeWord4Map = M.map unserializedIndividualMap @@ -162,39 +147,47 @@ data Compressed a mapImplementation = Compressed -- -- unserealizeWord4Map = M.map (uncurry zip . map (\(bytes, sizes) -> (bytes,) . (++ [maxBound]) . fromCompressedWord4List (Prelude.length bytes) $ sizes)) +data WordMarkovStats = WordMarkovStats + { location :: Ratio Integer, + size :: Ratio Integer + } + deriving (Show, Eq, Ord, Generic) -- serealizeWord4Map :: Word4Map a -> Word4MapSerialized a -- serealizeWord4Map = M.map (Bi.second toCompressedWord4List . L.unzip) decompress :: forall a. - (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) => - Compressed a (Word4Map a) -> + (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) => + Compressed a -> [a] -decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk - $ chunks +decompress (Compressed {..}) = + concat . P.runPar . P.parMap decompressChunk $ + chunks where - rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs' - - markovs' = -- unserealizeWord4Map - markovs - + rings :: M.Map Rational (a, Rational) + rings = M.fromList . map toDecompressionRing . M.assocs . toRing $ frequencies toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size)) decompressChunk :: Chunk a -> [a] - decompressChunk (Chunk {..}) = map snd . take (fromIntegral length) . L.iterate' decompress' $ (location', start) -- (fromBS rawLocation, start) + decompressChunk (Chunk {..}) = + map snd + . take (fromIntegral length) + . US.tail + . L.iterate' (decompress' . fst) + $ (location', 0) -- (fromBS rawLocation, start) where - location' = deserialize location -- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a) - decompress' (!loc, !prevWord) = -- traceShow (newSize) $ - ((loc - ansLoc) / newSize, newVal) + decompress' !loc = + -- traceShow (newSize) $ + ((loc - ansLoc) / newSize, newVal) where ansLoc :: Ratio Integer newVal :: a newSize :: Ratio Integer - (ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prevWord) + (ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ rings sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8] sanityCheck = map (sum . M.elems) . M.elems @@ -210,7 +203,6 @@ chunk chunkSize = chunk' chunkLength :: Int chunkLength = 4096 - -- runPar = id -- parMap = map @@ -219,48 +211,35 @@ chunkLength = 4096 -- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) => -- [a] -> -- Compressed a (Word4MapSerialized a) -compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a (Word4Map a) -compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap - (compressChunk) - $ unCompressedChunks +compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a +compress toCompress = + Compressed relativeFrequencies + . P.runPar + . P.parMap compressChunk + $ unCompressedChunks where - - toCompress = toCompress' -- toBurrowsWheeler toCompress' -- toTraceShow' = (unserealizeWord4Map . serealizeWord4Map $ twoByteMarkovs) == twoByteMarkovs -- toTraceShow = (fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) unCompressedChunks = chunk chunkLength toCompress - twoByteMarkovs = M.map (map (Bi.second discretizeFraction)) . twoByteMarkov $ toCompress + relativeFrequencies = map (Bi.second discretizeFraction) . listToRelativeCounts $ toCompress - rings = M.map toRing twoByteMarkovs + rings = toRing relativeFrequencies compressChunk toCompress = Chunk { location = shortestLocation endStats, - start = US.head toCompress, length = fromIntegral . L.length $ toCompress } where - pairs = zip toCompress . US.tail $ toCompress shortestLocation (WordMarkovStats {..}) = serialize $ binarySearch location (location + size) - -- shortestLocation (WordMarkovStats {..}) - -- -- | simplestBetweenAnswer /= upperBound = simplestBetweenAnswer - -- | otherwise = simplestBetween location $ upperBound - epsilon - -- where - -- simplestBetweenAnswer = simplestBetween location upperBound - -- upperBound = location + size + endStats = pyramidFold addWordMarkovStats . map statsFor $ toCompress - -- -- | almost entirely arbitrary, picked because it's gaurenteed to be smaller than size - -- -- | and should make for a fairly quick subtraction - -- epsilon = 1 % (denominator size) - - endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs - - statsFor (x0, x1) = (rings M.! x0) M.! x1 + statsFor = (rings M.!) addWordMarkovStats (WordMarkovStats {location = prevLoc, size = prevSize}) @@ -270,7 +249,6 @@ compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap size = prevSize * nextSize } - pyramidFold :: (a -> a -> a) -> [a] -> a pyramidFold f = pyramid where @@ -286,7 +264,7 @@ simplestBetween :: Rational -> Rational -> Rational simplestBetween x y | x == y = x | x > 0 = simplestBetween' n d n' d' - -- | x > 0 = simplestBetween' n d n' d' + -- \| x > 0 = simplestBetween' n d n' d' | otherwise = 0 % 1 where n = numerator x diff --git a/src/Compress/Huffman.hs b/src/Compress/Huffman.hs index 9e2b8c3..913e88b 100644 --- a/src/Compress/Huffman.hs +++ b/src/Compress/Huffman.hs @@ -30,7 +30,7 @@ import Options.Generic qualified as O import System.Environment qualified as SE import Data.HuffmanTree import Data.Word4 (CompressWord4List(xs)) -import Compress.BurrowsWheeler (toBurrowsWheeler, toCounts) +import Codec.Winery qualified as C decompress :: forall a @@ -85,13 +85,13 @@ compress dividedByteString = treeDirMap :: M.Map a [TreeDir] treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees -compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b) -compressWithBurrowsWheeler bs - = liftA2 BurrowsWheelerCompressed - (compress . map fst $ withCounts) - (compress . map snd $ withCounts) - where - withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs +-- compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b) +-- compressWithBurrowsWheeler bs +-- = liftA2 BurrowsWheelerCompressed +-- (compress . map fst $ withCounts) +-- (compress . map snd $ withCounts) +-- where +-- withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs -- testCompression -- :: forall a @@ -136,18 +136,18 @@ divideByteString n bs = x : divideByteString n xs (x, xs) = BS.splitAt n bs -compressionRatioFor - :: forall a b - . (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a, - Integral b, Ord b, C.Serialise b, Bounded b) - => D.Proxy a - -> D.Proxy b - -> BS.ByteString - -> Double -compressionRatioFor _ _ bs = - (/ (fromIntegral . BS.length $ bs)) - . fromIntegral - . BS.length - . C.serialise - . (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b)) - $ bs +-- compressionRatioFor +-- :: forall a b +-- . (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a, +-- Integral b, Ord b, C.Serialise b, Bounded b) +-- => D.Proxy a +-- -> D.Proxy b +-- -> BS.ByteString +-- -> Double +-- compressionRatioFor _ _ bs = +-- (/ (fromIntegral . BS.length $ bs)) +-- . fromIntegral +-- . BS.length +-- . C.serialise +-- . (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b)) +-- $ bs diff --git a/src/Compress/WordMarkovStats.hs b/src/Compress/WordMarkovStats.hs deleted file mode 100644 index 24289d1..0000000 --- a/src/Compress/WordMarkovStats.hs +++ /dev/null @@ -1,8 +0,0 @@ -module Compress.WordMarkovStats where - - -data WordMarkovStats = WordMarkovStats - { location :: Ratio Integer, - size :: Ratio Integer - } - deriving (Show, Eq, Ord, Generic) diff --git a/test/Test.hs b/test/Test.hs index 3f4286f..d4b894b 100644 --- a/test/Test.hs +++ b/test/Test.hs @@ -1,7 +1,5 @@ -import Compress.Arithmetic (twoByteMarkov) import Compress.Arithmetic qualified as A import Compress.LengthDistancePairs as LDP -import Compress.WordMarkovStats as WM import Data.Bifunctor qualified as Bi import Data.ByteString qualified as BS import Data.Dirs as D @@ -28,24 +26,24 @@ tests toCompress = "falsify" [ testGroup "Arithmetic" - [ testCase "works" $ - A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8]) - @?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])], - testCase "relativeCounts works as expected with one param" $ - A.relativeCounts [(0, 30)] - @?= [(0, 1)], - testCase "relativeCounts works as expected with lots of params" $ - A.relativeCounts [(0, 30), (1, 20)] - @?= [(0, 30 % 50), (1, 20 % 50)], - testCase "toRing locs all less than 1" $ - assertBool "larger than one" $ - all (all ((<= 1) . (.location))) toCompressRing, - testCase "toRing sizes all add up to 1" $ - assertBool "larger than 1" $ - all ((== 1) . sum . map (.size)) toCompressRing, - testCase "toRing gives no zero sizes" $ - assertBool "== 0" $ - all (all ((/= 0) . (.size))) toCompressRing, + [ -- testCase "works" $ + -- A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8]) + -- @?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])], + -- testCase "relativeCounts works as expected with one param" $ + -- A.relativeCounts [(0, 30)] + -- @?= [(0, 1)], + -- testCase "relativeCounts works as expected with lots of params" $ + -- A.relativeCounts [(0, 30), (1, 20)] + -- @?= [(0, 30 % 50), (1, 20 % 50)], + -- testCase "toRing locs all less than 1" $ + -- assertBool "larger than one" $ + -- all (all ((<= 1) . (.location))) toCompressRing, + -- testCase "toRing sizes all add up to 1" $ + -- assertBool "larger than 1" $ + -- all ((== 1) . sum . map (.size)) toCompressRing, + -- testCase "toRing gives no zero sizes" $ + -- assertBool "== 0" $ + -- all (all ((/= 0) . (.size))) toCompressRing, F.testProperty "binary search" propBinarySearchWithinBounds, F.testProperty "compress and decompress isomorphism" (propCompressDecompressIsomorphism (A.decompress . A.compress)) ] @@ -55,12 +53,12 @@ tests toCompress = -- ] ] where - toCompressRing = - map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8))) - . M.elems - . A.twoByteMarkov - . BS.unpack - $ toCompress + -- toCompressRing = + -- map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8))) + -- . M.elems + -- . A.twoByteMarkov + -- . BS.unpack + -- $ toCompress wordMaxBound :: Integer wordMaxBound = fromIntegral (maxBound :: Word)