remove markov chain from arithmetic compression in preparition for move to front transform

This commit is contained in:
Jack Wines 2025-12-04 19:33:49 -08:00
parent 4a963eb383
commit 9f3c1d0168
No known key found for this signature in database
6 changed files with 146 additions and 179 deletions

View file

@ -75,7 +75,6 @@ common deps
Compress.BurrowsWheeler
Compress.Huffman
Compress.PrefixTree
Compress.WordMarkovStats
Compress.LengthDistancePairs
Compress.MoveToFrontTransform
Data.Dirs

30
flake.lock generated
View file

@ -37,11 +37,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1756770412,
"narHash": "sha256-+uWLQZccFHwqpGqr2Yt5VsW/PbeJVTn9Dk6SHWhNRPw=",
"lastModified": 1763759067,
"narHash": "sha256-LlLt2Jo/gMNYAwOgdRQBrsRoOz7BPRkzvNaI/fzXi2Q=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "4524271976b625a4a605beefd893f270620fd751",
"rev": "2cccadc7357c0ba201788ae99c4dfa90728ef5e0",
"type": "github"
},
"original": {
@ -67,11 +67,11 @@
},
"haskell-flake": {
"locked": {
"lastModified": 1756607542,
"narHash": "sha256-+99fEAk0HwjYgIW2tEOs7ayBDxnU9NAM5E29ZxgyX40=",
"lastModified": 1763904090,
"narHash": "sha256-Mq99c4O6Ovx/JQgi6B6fnI9xR77nQ9+GcFTM2113nMI=",
"owner": "srid",
"repo": "haskell-flake",
"rev": "73e3891fb135c679a1c30fae4b101e5b41b8ca61",
"rev": "73c006f8e6531acab28bbde362ae8f13ee0ac72b",
"type": "github"
},
"original": {
@ -97,11 +97,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1756542300,
"narHash": "sha256-tlOn88coG5fzdyqz6R93SQL5Gpq+m/DsWpekNFhqPQk=",
"lastModified": 1764517877,
"narHash": "sha256-pp3uT4hHijIC8JUK5MEqeAWmParJrgBVzHLNfJDZxg4=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "d7600c775f877cd87b4f5a831c28aa94137377aa",
"rev": "2d293cbfa5a793b4c50d17c05ef9e385b90edf6c",
"type": "github"
},
"original": {
@ -113,11 +113,11 @@
},
"nixpkgs-lib": {
"locked": {
"lastModified": 1754788789,
"narHash": "sha256-x2rJ+Ovzq0sCMpgfgGaaqgBSwY+LST+WbZ6TytnT9Rk=",
"lastModified": 1761765539,
"narHash": "sha256-b0yj6kfvO8ApcSE+QmA6mUfu8IYG6/uU28OFn4PaC8M=",
"owner": "nix-community",
"repo": "nixpkgs.lib",
"rev": "a73b9c743612e4244d865a2fdee11865283c04e6",
"rev": "719359f4562934ae99f5443f20aa06c2ffff91fc",
"type": "github"
},
"original": {
@ -162,11 +162,11 @@
]
},
"locked": {
"lastModified": 1756662192,
"narHash": "sha256-F1oFfV51AE259I85av+MAia221XwMHCOtZCMcZLK2Jk=",
"lastModified": 1762938485,
"narHash": "sha256-AlEObg0syDl+Spi4LsZIBrjw+snSVU4T8MOeuZJUJjM=",
"owner": "numtide",
"repo": "treefmt-nix",
"rev": "1aabc6c05ccbcbf4a635fb7a90400e44282f61c4",
"rev": "5b4ee75aeefd1e2d5a1cc43cf6ba65eba75e83e4",
"type": "github"
},
"original": {

View file

@ -2,130 +2,115 @@
-- module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized, Word4Map) where
module Compress.Arithmetic where
import Data.Bits qualified as B
import Data.Word
import Data.Bifunctor qualified as Bi
import Codec.Winery (WineryRecord)
import Codec.Winery qualified as C
import Control.Arrow qualified as Ar
import Control.Monad.Par (NFData)
import Control.Monad.Par qualified as P
import Data.Bifunctor qualified as Bi
import Data.Bit qualified as BV
import Data.Bit qualified as VB
import Data.Bits qualified as B
import Data.ByteString qualified as BS
import Data.ByteString qualified as By
import Data.Dirs
import Data.FiniteBit qualified as Fi
import qualified Data.Foldable as F
import Data.Foldable qualified as F
import Data.List (genericLength)
import Data.List qualified as L
import Data.Map.Strict qualified as M
import Data.Maybe qualified as My
import Data.Ord
-- import qualified Debug.Trace as D
import Data.Proxy qualified as P
import Data.Ratio
import Codec.Winery qualified as C
import Data.Vector qualified as V
import Data.Vector.Unboxed qualified as VU
import Data.Word
import Data.Word4 (CompressWord4List, Word4, fromCompressedWord4List, toCompressedWord4List)
import GHC.Generics (Generic)
import GHC.Natural (Natural)
import qualified Relude.Unsafe as US
import Data.Word4 (Word4, CompressWord4List, toCompressedWord4List, fromCompressedWord4List)
import qualified Data.ByteString as BS
import qualified Relude.Unsafe as U
import Codec.Winery (WineryRecord)
import Compress.BurrowsWheeler (toBurrowsWheeler)
import qualified Data.Bit as BV
import qualified Data.Vector as V
import qualified Data.Bit as VB
import qualified Data.Vector.Unboxed as VU
import Data.Dirs
-- import qualified Debug.Trace as D
import qualified Data.Proxy as P
import Compress.WordMarkovStats as WM
import Relude.Unsafe qualified as U
import Relude.Unsafe qualified as US
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
-- shortestNumberBetween :: Ratio Integer -> Ratio Integer -> Maybe (Ratio Integer)
-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums
toRing :: forall a k. (Show a, Show k, Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
where
wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
maxBound' :: Integer
maxBound' = fromIntegral (maxBound :: a)
resize xs = map ( / sum' ) xs
resize xs = map (/ sum') xs
where
sum' = sum xs
withBumpedZeroSized :: [Rational]
withBumpedZeroSized
= resize
withBumpedZeroSized =
resize
. map (max minimumSize)
. resize
. map (fromIntegral . snd)
$ xs
where
-- | we have to round the 0s up to something
-- | we somewhat arbitrarily choose half of the smallest
-- | normally representable amount
-- \| we have to round the 0s up to something
-- \| we somewhat arbitrarily choose half of the smallest
-- \| normally representable amount
minimumSize :: Rational
minimumSize = 1 % (maxBound' * 2)
f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
where
newFrac = currFrac * (1 - runningSum)
relativeCounts :: forall k . Ord k => [(k, Integer)] -> [(k, Rational)]
relativeCounts :: forall k. (Ord k) => [(k, Integer)] -> [(k, Rational)]
relativeCounts m = map (Bi.second asFractionOfSum) m
where
asFractionOfSum = ( % sum') . fromIntegral
asFractionOfSum = (% sum') . fromIntegral
sum' = fromIntegral . F.sum . map snd $ m
maxBound' :: forall a b . (Num b, Bounded a, Integral a) => P.Proxy a -> b
maxBound' :: forall a b. (Num b, Bounded a, Integral a) => P.Proxy a -> b
maxBound' p = fromIntegral (maxBound :: a)
twoByteMarkov :: forall k . (Ord k) => [k] -> M.Map k [(k, Rational)]
twoByteMarkov xs =
M.map sizeAsFraction
. M.fromListWith (M.unionWith (+))
. zip xs
. map (`M.singleton` (1 :: Integer))
. US.tail
listToRelativeCounts :: forall k. (Ord k) => [k] -> [(k, Rational)]
listToRelativeCounts xs =
relativeCounts
. M.assocs
. M.fromListWith (+)
. map (,1)
$ xs
where
sizeAsFraction :: M.Map k Integer -> [(k, Rational)]
sizeAsFraction = relativeCounts . M.assocs
data Chunk a = Chunk
{ location :: SerializedDirs,
start :: a,
length :: Word64
}
deriving (Eq, Ord, Show, Generic, P.NFData)
deriving C.Serialise via C.WineryVariant (Chunk a)
deriving (C.Serialise) via C.WineryVariant (Chunk a)
-- maxBound = maxBound' (P.Proxy :: Proxy b)
discretizeFraction :: forall b . (Bounded b, Integral b) => Rational -> b
discretizeFraction :: forall b. (Bounded b, Integral b) => Rational -> b
discretizeFraction = floor . min maxBound'' . fromRational . (* (succ $ maxBound' (P.Proxy :: P.Proxy b)))
where
maxBound'' = maxBound' (P.Proxy :: P.Proxy b)
type Word4MapSerialized a = M.Map a ([a], CompressWord4List)
type Word4Map a = M.Map a [(a, Word8)]
type Word4Map a = M.Map a [(a, Word8)]
-- talkAboutCompressed (Compressed {..}) = (
-- ("markovs (serealized)", BS.length . C.serialiseOnly $ markovs),
-- ("markovs (unserealized)", BS.length . C.serialiseOnly . unserealizeWord4Map $ markovs),
-- ("chunks", BS.length . C.serialiseOnly $ chunks))
-- ("chunks", BS.length . C.serialiseOnly $ chunks))
newtype Prediction = Prediction {
chances :: [(Word8, Rational)]
newtype Prediction = Prediction
{ chances :: [(Word8, Rational)]
}
class CompressionModel a where
@ -136,7 +121,7 @@ class CompressionModel a where
-- at some point write
-- instance IsoSerialisable a b => C.Serialise b
class C.Serialise b => IsoSerialisable a b where
class (C.Serialise b) => IsoSerialisable a b where
toSerialisable :: a -> b
fromSerialisable :: b -> a
@ -145,14 +130,14 @@ class C.Serialise b => IsoSerialisable a b where
-- deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
-- deriving C.Serialise via C.WineryRecord BetterCompressed
data Compressed a mapImplementation = Compressed
{ markovs :: mapImplementation,
data Compressed a = Compressed
{
-- needs a better name than markovs
frequencies :: [(a, Word8)], -- we'll paramaterize this at some point::168
chunks :: [Chunk a]
}
deriving (Eq, Ord, Show, Generic, NFData) -- , C.Serialise, P.NFData)
deriving C.Serialise via C.WineryVariant (Compressed a mapImplementation)
deriving (C.Serialise) via C.WineryVariant (Compressed a)
-- unserealizeWord4Map :: forall a . Show a => Word4MapSerialized a -> Word4Map a
-- unserealizeWord4Map = M.map unserializedIndividualMap
@ -162,6 +147,11 @@ data Compressed a mapImplementation = Compressed
-- -- unserealizeWord4Map = M.map (uncurry zip . map (\(bytes, sizes) -> (bytes,) . (++ [maxBound]) . fromCompressedWord4List (Prelude.length bytes) $ sizes))
data WordMarkovStats = WordMarkovStats
{ location :: Ratio Integer,
size :: Ratio Integer
}
deriving (Show, Eq, Ord, Generic)
-- serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
-- serealizeWord4Map = M.map (Bi.second toCompressedWord4List . L.unzip)
@ -169,32 +159,35 @@ data Compressed a mapImplementation = Compressed
decompress ::
forall a.
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
Compressed a (Word4Map a) ->
Compressed a ->
[a]
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk
$ chunks
decompress (Compressed {..}) =
concat . P.runPar . P.parMap decompressChunk $
chunks
where
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs'
markovs' = -- unserealizeWord4Map
markovs
rings :: M.Map Rational (a, Rational)
rings = M.fromList . map toDecompressionRing . M.assocs . toRing $ frequencies
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
decompressChunk :: Chunk a -> [a]
decompressChunk (Chunk {..}) = map snd . take (fromIntegral length) . L.iterate' decompress' $ (location', start) -- (fromBS rawLocation, start)
decompressChunk (Chunk {..}) =
map snd
. take (fromIntegral length)
. US.tail
. L.iterate' (decompress' . fst)
$ (location', 0) -- (fromBS rawLocation, start)
where
location' = deserialize location
-- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
decompress' (!loc, !prevWord) = -- traceShow (newSize) $
decompress' !loc =
-- traceShow (newSize) $
((loc - ansLoc) / newSize, newVal)
where
ansLoc :: Ratio Integer
newVal :: a
newSize :: Ratio Integer
(ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prevWord)
(ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ rings
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
sanityCheck = map (sum . M.elems) . M.elems
@ -210,7 +203,6 @@ chunk chunkSize = chunk'
chunkLength :: Int
chunkLength = 4096
-- runPar = id
-- parMap = map
@ -219,48 +211,35 @@ chunkLength = 4096
-- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
-- [a] ->
-- Compressed a (Word4MapSerialized a)
compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a (Word4Map a)
compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap
(compressChunk)
compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a
compress toCompress =
Compressed relativeFrequencies
. P.runPar
. P.parMap compressChunk
$ unCompressedChunks
where
toCompress = toCompress' -- toBurrowsWheeler toCompress'
-- toTraceShow' = (unserealizeWord4Map . serealizeWord4Map $ twoByteMarkovs) == twoByteMarkovs
-- toTraceShow = (fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))
unCompressedChunks = chunk chunkLength toCompress
twoByteMarkovs = M.map (map (Bi.second discretizeFraction)) . twoByteMarkov $ toCompress
relativeFrequencies = map (Bi.second discretizeFraction) . listToRelativeCounts $ toCompress
rings = M.map toRing twoByteMarkovs
rings = toRing relativeFrequencies
compressChunk toCompress =
Chunk
{ location = shortestLocation endStats,
start = US.head toCompress,
length = fromIntegral . L.length $ toCompress
}
where
pairs = zip toCompress . US.tail $ toCompress
shortestLocation (WordMarkovStats {..}) = serialize $ binarySearch location (location + size)
-- shortestLocation (WordMarkovStats {..})
-- -- | simplestBetweenAnswer /= upperBound = simplestBetweenAnswer
-- | otherwise = simplestBetween location $ upperBound - epsilon
-- where
-- simplestBetweenAnswer = simplestBetween location upperBound
-- upperBound = location + size
endStats = pyramidFold addWordMarkovStats . map statsFor $ toCompress
-- -- | almost entirely arbitrary, picked because it's gaurenteed to be smaller than size
-- -- | and should make for a fairly quick subtraction
-- epsilon = 1 % (denominator size)
endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
statsFor (x0, x1) = (rings M.! x0) M.! x1
statsFor = (rings M.!)
addWordMarkovStats
(WordMarkovStats {location = prevLoc, size = prevSize})
@ -270,7 +249,6 @@ compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap
size = prevSize * nextSize
}
pyramidFold :: (a -> a -> a) -> [a] -> a
pyramidFold f = pyramid
where
@ -286,7 +264,7 @@ simplestBetween :: Rational -> Rational -> Rational
simplestBetween x y
| x == y = x
| x > 0 = simplestBetween' n d n' d'
-- | x > 0 = simplestBetween' n d n' d'
-- \| x > 0 = simplestBetween' n d n' d'
| otherwise = 0 % 1
where
n = numerator x

View file

@ -30,7 +30,7 @@ import Options.Generic qualified as O
import System.Environment qualified as SE
import Data.HuffmanTree
import Data.Word4 (CompressWord4List(xs))
import Compress.BurrowsWheeler (toBurrowsWheeler, toCounts)
import Codec.Winery qualified as C
decompress
:: forall a
@ -85,13 +85,13 @@ compress dividedByteString =
treeDirMap :: M.Map a [TreeDir]
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b)
compressWithBurrowsWheeler bs
= liftA2 BurrowsWheelerCompressed
(compress . map fst $ withCounts)
(compress . map snd $ withCounts)
where
withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs
-- compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b)
-- compressWithBurrowsWheeler bs
-- = liftA2 BurrowsWheelerCompressed
-- (compress . map fst $ withCounts)
-- (compress . map snd $ withCounts)
-- where
-- withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs
-- testCompression
-- :: forall a
@ -136,18 +136,18 @@ divideByteString n bs = x : divideByteString n xs
(x, xs) = BS.splitAt n bs
compressionRatioFor
:: forall a b
. (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a,
Integral b, Ord b, C.Serialise b, Bounded b)
=> D.Proxy a
-> D.Proxy b
-> BS.ByteString
-> Double
compressionRatioFor _ _ bs =
(/ (fromIntegral . BS.length $ bs))
. fromIntegral
. BS.length
. C.serialise
. (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b))
$ bs
-- compressionRatioFor
-- :: forall a b
-- . (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a,
-- Integral b, Ord b, C.Serialise b, Bounded b)
-- => D.Proxy a
-- -> D.Proxy b
-- -> BS.ByteString
-- -> Double
-- compressionRatioFor _ _ bs =
-- (/ (fromIntegral . BS.length $ bs))
-- . fromIntegral
-- . BS.length
-- . C.serialise
-- . (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b))
-- $ bs

View file

@ -1,8 +0,0 @@
module Compress.WordMarkovStats where
data WordMarkovStats = WordMarkovStats
{ location :: Ratio Integer,
size :: Ratio Integer
}
deriving (Show, Eq, Ord, Generic)

View file

@ -1,7 +1,5 @@
import Compress.Arithmetic (twoByteMarkov)
import Compress.Arithmetic qualified as A
import Compress.LengthDistancePairs as LDP
import Compress.WordMarkovStats as WM
import Data.Bifunctor qualified as Bi
import Data.ByteString qualified as BS
import Data.Dirs as D
@ -28,24 +26,24 @@ tests toCompress =
"falsify"
[ testGroup
"Arithmetic"
[ testCase "works" $
A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8])
@?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])],
testCase "relativeCounts works as expected with one param" $
A.relativeCounts [(0, 30)]
@?= [(0, 1)],
testCase "relativeCounts works as expected with lots of params" $
A.relativeCounts [(0, 30), (1, 20)]
@?= [(0, 30 % 50), (1, 20 % 50)],
testCase "toRing locs all less than 1" $
assertBool "larger than one" $
all (all ((<= 1) . (.location))) toCompressRing,
testCase "toRing sizes all add up to 1" $
assertBool "larger than 1" $
all ((== 1) . sum . map (.size)) toCompressRing,
testCase "toRing gives no zero sizes" $
assertBool "== 0" $
all (all ((/= 0) . (.size))) toCompressRing,
[ -- testCase "works" $
-- A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8])
-- @?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])],
-- testCase "relativeCounts works as expected with one param" $
-- A.relativeCounts [(0, 30)]
-- @?= [(0, 1)],
-- testCase "relativeCounts works as expected with lots of params" $
-- A.relativeCounts [(0, 30), (1, 20)]
-- @?= [(0, 30 % 50), (1, 20 % 50)],
-- testCase "toRing locs all less than 1" $
-- assertBool "larger than one" $
-- all (all ((<= 1) . (.location))) toCompressRing,
-- testCase "toRing sizes all add up to 1" $
-- assertBool "larger than 1" $
-- all ((== 1) . sum . map (.size)) toCompressRing,
-- testCase "toRing gives no zero sizes" $
-- assertBool "== 0" $
-- all (all ((/= 0) . (.size))) toCompressRing,
F.testProperty "binary search" propBinarySearchWithinBounds,
F.testProperty "compress and decompress isomorphism" (propCompressDecompressIsomorphism (A.decompress . A.compress))
]
@ -55,12 +53,12 @@ tests toCompress =
-- ]
]
where
toCompressRing =
map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8)))
. M.elems
. A.twoByteMarkov
. BS.unpack
$ toCompress
-- toCompressRing =
-- map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8)))
-- . M.elems
-- . A.twoByteMarkov
-- . BS.unpack
-- $ toCompress
wordMaxBound :: Integer
wordMaxBound = fromIntegral (maxBound :: Word)