remove markov chain from arithmetic compression in preparition for move to front transform

This commit is contained in:
Jack Wines 2025-12-04 19:33:49 -08:00
parent 4a963eb383
commit 9f3c1d0168
No known key found for this signature in database
6 changed files with 146 additions and 179 deletions

View file

@ -75,7 +75,6 @@ common deps
Compress.BurrowsWheeler Compress.BurrowsWheeler
Compress.Huffman Compress.Huffman
Compress.PrefixTree Compress.PrefixTree
Compress.WordMarkovStats
Compress.LengthDistancePairs Compress.LengthDistancePairs
Compress.MoveToFrontTransform Compress.MoveToFrontTransform
Data.Dirs Data.Dirs

30
flake.lock generated
View file

@ -37,11 +37,11 @@
"nixpkgs-lib": "nixpkgs-lib" "nixpkgs-lib": "nixpkgs-lib"
}, },
"locked": { "locked": {
"lastModified": 1756770412, "lastModified": 1763759067,
"narHash": "sha256-+uWLQZccFHwqpGqr2Yt5VsW/PbeJVTn9Dk6SHWhNRPw=", "narHash": "sha256-LlLt2Jo/gMNYAwOgdRQBrsRoOz7BPRkzvNaI/fzXi2Q=",
"owner": "hercules-ci", "owner": "hercules-ci",
"repo": "flake-parts", "repo": "flake-parts",
"rev": "4524271976b625a4a605beefd893f270620fd751", "rev": "2cccadc7357c0ba201788ae99c4dfa90728ef5e0",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -67,11 +67,11 @@
}, },
"haskell-flake": { "haskell-flake": {
"locked": { "locked": {
"lastModified": 1756607542, "lastModified": 1763904090,
"narHash": "sha256-+99fEAk0HwjYgIW2tEOs7ayBDxnU9NAM5E29ZxgyX40=", "narHash": "sha256-Mq99c4O6Ovx/JQgi6B6fnI9xR77nQ9+GcFTM2113nMI=",
"owner": "srid", "owner": "srid",
"repo": "haskell-flake", "repo": "haskell-flake",
"rev": "73e3891fb135c679a1c30fae4b101e5b41b8ca61", "rev": "73c006f8e6531acab28bbde362ae8f13ee0ac72b",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -97,11 +97,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1756542300, "lastModified": 1764517877,
"narHash": "sha256-tlOn88coG5fzdyqz6R93SQL5Gpq+m/DsWpekNFhqPQk=", "narHash": "sha256-pp3uT4hHijIC8JUK5MEqeAWmParJrgBVzHLNfJDZxg4=",
"owner": "nixos", "owner": "nixos",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "d7600c775f877cd87b4f5a831c28aa94137377aa", "rev": "2d293cbfa5a793b4c50d17c05ef9e385b90edf6c",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -113,11 +113,11 @@
}, },
"nixpkgs-lib": { "nixpkgs-lib": {
"locked": { "locked": {
"lastModified": 1754788789, "lastModified": 1761765539,
"narHash": "sha256-x2rJ+Ovzq0sCMpgfgGaaqgBSwY+LST+WbZ6TytnT9Rk=", "narHash": "sha256-b0yj6kfvO8ApcSE+QmA6mUfu8IYG6/uU28OFn4PaC8M=",
"owner": "nix-community", "owner": "nix-community",
"repo": "nixpkgs.lib", "repo": "nixpkgs.lib",
"rev": "a73b9c743612e4244d865a2fdee11865283c04e6", "rev": "719359f4562934ae99f5443f20aa06c2ffff91fc",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -162,11 +162,11 @@
] ]
}, },
"locked": { "locked": {
"lastModified": 1756662192, "lastModified": 1762938485,
"narHash": "sha256-F1oFfV51AE259I85av+MAia221XwMHCOtZCMcZLK2Jk=", "narHash": "sha256-AlEObg0syDl+Spi4LsZIBrjw+snSVU4T8MOeuZJUJjM=",
"owner": "numtide", "owner": "numtide",
"repo": "treefmt-nix", "repo": "treefmt-nix",
"rev": "1aabc6c05ccbcbf4a635fb7a90400e44282f61c4", "rev": "5b4ee75aeefd1e2d5a1cc43cf6ba65eba75e83e4",
"type": "github" "type": "github"
}, },
"original": { "original": {

View file

@ -2,49 +2,45 @@
-- module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized, Word4Map) where -- module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized, Word4Map) where
module Compress.Arithmetic where module Compress.Arithmetic where
import Data.Bits qualified as B import Codec.Winery (WineryRecord)
import Data.Word import Codec.Winery qualified as C
import Data.Bifunctor qualified as Bi
import Control.Arrow qualified as Ar import Control.Arrow qualified as Ar
import Control.Monad.Par (NFData) import Control.Monad.Par (NFData)
import Control.Monad.Par qualified as P import Control.Monad.Par qualified as P
import Data.Bifunctor qualified as Bi
import Data.Bit qualified as BV
import Data.Bit qualified as VB
import Data.Bits qualified as B
import Data.ByteString qualified as BS
import Data.ByteString qualified as By import Data.ByteString qualified as By
import Data.Dirs
import Data.FiniteBit qualified as Fi import Data.FiniteBit qualified as Fi
import qualified Data.Foldable as F import Data.Foldable qualified as F
import Data.List (genericLength) import Data.List (genericLength)
import Data.List qualified as L import Data.List qualified as L
import Data.Map.Strict qualified as M import Data.Map.Strict qualified as M
import Data.Maybe qualified as My import Data.Maybe qualified as My
import Data.Ord import Data.Ord
-- import qualified Debug.Trace as D
import Data.Proxy qualified as P
import Data.Ratio import Data.Ratio
import Codec.Winery qualified as C import Data.Vector qualified as V
import Data.Vector.Unboxed qualified as VU
import Data.Word
import Data.Word4 (CompressWord4List, Word4, fromCompressedWord4List, toCompressedWord4List)
import GHC.Generics (Generic) import GHC.Generics (Generic)
import GHC.Natural (Natural) import GHC.Natural (Natural)
import qualified Relude.Unsafe as US import Relude.Unsafe qualified as U
import Data.Word4 (Word4, CompressWord4List, toCompressedWord4List, fromCompressedWord4List) import Relude.Unsafe qualified as US
import qualified Data.ByteString as BS
import qualified Relude.Unsafe as U
import Codec.Winery (WineryRecord)
import Compress.BurrowsWheeler (toBurrowsWheeler)
import qualified Data.Bit as BV
import qualified Data.Vector as V
import qualified Data.Bit as VB
import qualified Data.Vector.Unboxed as VU
import Data.Dirs
-- import qualified Debug.Trace as D
import qualified Data.Proxy as P
import Compress.WordMarkovStats as WM
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes -- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
-- shortestNumberBetween :: Ratio Integer -> Ratio Integer -> Maybe (Ratio Integer) -- shortestNumberBetween :: Ratio Integer -> Ratio Integer -> Maybe (Ratio Integer)
-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums -- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums
toRing :: forall a k. (Show a, Show k, Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats toRing :: forall a k. (Show a, Show k, Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
where where
wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
maxBound' :: Integer maxBound' :: Integer
@ -55,57 +51,47 @@ toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
sum' = sum xs sum' = sum xs
withBumpedZeroSized :: [Rational] withBumpedZeroSized :: [Rational]
withBumpedZeroSized withBumpedZeroSized =
= resize resize
. map (max minimumSize) . map (max minimumSize)
. resize . resize
. map (fromIntegral . snd) . map (fromIntegral . snd)
$ xs $ xs
where where
-- | we have to round the 0s up to something -- \| we have to round the 0s up to something
-- | we somewhat arbitrarily choose half of the smallest -- \| we somewhat arbitrarily choose half of the smallest
-- | normally representable amount -- \| normally representable amount
minimumSize :: Rational minimumSize :: Rational
minimumSize = 1 % (maxBound' * 2) minimumSize = 1 % (maxBound' * 2)
f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum) f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
where where
newFrac = currFrac * (1 - runningSum) newFrac = currFrac * (1 - runningSum)
relativeCounts :: forall k . Ord k => [(k, Integer)] -> [(k, Rational)] relativeCounts :: forall k. (Ord k) => [(k, Integer)] -> [(k, Rational)]
relativeCounts m = map (Bi.second asFractionOfSum) m relativeCounts m = map (Bi.second asFractionOfSum) m
where where
asFractionOfSum = (% sum') . fromIntegral asFractionOfSum = (% sum') . fromIntegral
sum' = fromIntegral . F.sum . map snd $ m sum' = fromIntegral . F.sum . map snd $ m
maxBound' :: forall a b. (Num b, Bounded a, Integral a) => P.Proxy a -> b maxBound' :: forall a b. (Num b, Bounded a, Integral a) => P.Proxy a -> b
maxBound' p = fromIntegral (maxBound :: a) maxBound' p = fromIntegral (maxBound :: a)
twoByteMarkov :: forall k . (Ord k) => [k] -> M.Map k [(k, Rational)] listToRelativeCounts :: forall k. (Ord k) => [k] -> [(k, Rational)]
twoByteMarkov xs = listToRelativeCounts xs =
M.map sizeAsFraction relativeCounts
. M.fromListWith (M.unionWith (+)) . M.assocs
. zip xs . M.fromListWith (+)
. map (`M.singleton` (1 :: Integer)) . map (,1)
. US.tail
$ xs $ xs
where
sizeAsFraction :: M.Map k Integer -> [(k, Rational)]
sizeAsFraction = relativeCounts . M.assocs
data Chunk a = Chunk data Chunk a = Chunk
{ location :: SerializedDirs, { location :: SerializedDirs,
start :: a,
length :: Word64 length :: Word64
} }
deriving (Eq, Ord, Show, Generic, P.NFData) deriving (Eq, Ord, Show, Generic, P.NFData)
deriving C.Serialise via C.WineryVariant (Chunk a) deriving (C.Serialise) via C.WineryVariant (Chunk a)
-- maxBound = maxBound' (P.Proxy :: Proxy b) -- maxBound = maxBound' (P.Proxy :: Proxy b)
@ -114,18 +100,17 @@ discretizeFraction = floor . min maxBound'' . fromRational . (* (succ $ maxBound
where where
maxBound'' = maxBound' (P.Proxy :: P.Proxy b) maxBound'' = maxBound' (P.Proxy :: P.Proxy b)
type Word4MapSerialized a = M.Map a ([a], CompressWord4List) type Word4MapSerialized a = M.Map a ([a], CompressWord4List)
type Word4Map a = M.Map a [(a, Word8)]
type Word4Map a = M.Map a [(a, Word8)]
-- talkAboutCompressed (Compressed {..}) = ( -- talkAboutCompressed (Compressed {..}) = (
-- ("markovs (serealized)", BS.length . C.serialiseOnly $ markovs), -- ("markovs (serealized)", BS.length . C.serialiseOnly $ markovs),
-- ("markovs (unserealized)", BS.length . C.serialiseOnly . unserealizeWord4Map $ markovs), -- ("markovs (unserealized)", BS.length . C.serialiseOnly . unserealizeWord4Map $ markovs),
-- ("chunks", BS.length . C.serialiseOnly $ chunks)) -- ("chunks", BS.length . C.serialiseOnly $ chunks))
newtype Prediction = Prediction { newtype Prediction = Prediction
chances :: [(Word8, Rational)] { chances :: [(Word8, Rational)]
} }
class CompressionModel a where class CompressionModel a where
@ -136,7 +121,7 @@ class CompressionModel a where
-- at some point write -- at some point write
-- instance IsoSerialisable a b => C.Serialise b -- instance IsoSerialisable a b => C.Serialise b
class C.Serialise b => IsoSerialisable a b where class (C.Serialise b) => IsoSerialisable a b where
toSerialisable :: a -> b toSerialisable :: a -> b
fromSerialisable :: b -> a fromSerialisable :: b -> a
@ -145,14 +130,14 @@ class C.Serialise b => IsoSerialisable a b where
-- deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData) -- deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
-- deriving C.Serialise via C.WineryRecord BetterCompressed -- deriving C.Serialise via C.WineryRecord BetterCompressed
data Compressed a mapImplementation = Compressed data Compressed a = Compressed
{ markovs :: mapImplementation, {
-- needs a better name than markovs
frequencies :: [(a, Word8)], -- we'll paramaterize this at some point::168
chunks :: [Chunk a] chunks :: [Chunk a]
} }
deriving (Eq, Ord, Show, Generic, NFData) -- , C.Serialise, P.NFData) deriving (Eq, Ord, Show, Generic, NFData) -- , C.Serialise, P.NFData)
deriving C.Serialise via C.WineryVariant (Compressed a mapImplementation) deriving (C.Serialise) via C.WineryVariant (Compressed a)
-- unserealizeWord4Map :: forall a . Show a => Word4MapSerialized a -> Word4Map a -- unserealizeWord4Map :: forall a . Show a => Word4MapSerialized a -> Word4Map a
-- unserealizeWord4Map = M.map unserializedIndividualMap -- unserealizeWord4Map = M.map unserializedIndividualMap
@ -162,6 +147,11 @@ data Compressed a mapImplementation = Compressed
-- -- unserealizeWord4Map = M.map (uncurry zip . map (\(bytes, sizes) -> (bytes,) . (++ [maxBound]) . fromCompressedWord4List (Prelude.length bytes) $ sizes)) -- -- unserealizeWord4Map = M.map (uncurry zip . map (\(bytes, sizes) -> (bytes,) . (++ [maxBound]) . fromCompressedWord4List (Prelude.length bytes) $ sizes))
data WordMarkovStats = WordMarkovStats
{ location :: Ratio Integer,
size :: Ratio Integer
}
deriving (Show, Eq, Ord, Generic)
-- serealizeWord4Map :: Word4Map a -> Word4MapSerialized a -- serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
-- serealizeWord4Map = M.map (Bi.second toCompressedWord4List . L.unzip) -- serealizeWord4Map = M.map (Bi.second toCompressedWord4List . L.unzip)
@ -169,32 +159,35 @@ data Compressed a mapImplementation = Compressed
decompress :: decompress ::
forall a. forall a.
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) => (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
Compressed a (Word4Map a) -> Compressed a ->
[a] [a]
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk decompress (Compressed {..}) =
$ chunks concat . P.runPar . P.parMap decompressChunk $
chunks
where where
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs' rings :: M.Map Rational (a, Rational)
rings = M.fromList . map toDecompressionRing . M.assocs . toRing $ frequencies
markovs' = -- unserealizeWord4Map
markovs
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size)) toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
decompressChunk :: Chunk a -> [a] decompressChunk :: Chunk a -> [a]
decompressChunk (Chunk {..}) = map snd . take (fromIntegral length) . L.iterate' decompress' $ (location', start) -- (fromBS rawLocation, start) decompressChunk (Chunk {..}) =
map snd
. take (fromIntegral length)
. US.tail
. L.iterate' (decompress' . fst)
$ (location', 0) -- (fromBS rawLocation, start)
where where
location' = deserialize location location' = deserialize location
-- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a) -- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
decompress' (!loc, !prevWord) = -- traceShow (newSize) $ decompress' !loc =
-- traceShow (newSize) $
((loc - ansLoc) / newSize, newVal) ((loc - ansLoc) / newSize, newVal)
where where
ansLoc :: Ratio Integer ansLoc :: Ratio Integer
newVal :: a newVal :: a
newSize :: Ratio Integer newSize :: Ratio Integer
(ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prevWord) (ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ rings
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8] sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
sanityCheck = map (sum . M.elems) . M.elems sanityCheck = map (sum . M.elems) . M.elems
@ -210,7 +203,6 @@ chunk chunkSize = chunk'
chunkLength :: Int chunkLength :: Int
chunkLength = 4096 chunkLength = 4096
-- runPar = id -- runPar = id
-- parMap = map -- parMap = map
@ -219,48 +211,35 @@ chunkLength = 4096
-- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) => -- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
-- [a] -> -- [a] ->
-- Compressed a (Word4MapSerialized a) -- Compressed a (Word4MapSerialized a)
compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a (Word4Map a) compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a
compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap compress toCompress =
(compressChunk) Compressed relativeFrequencies
. P.runPar
. P.parMap compressChunk
$ unCompressedChunks $ unCompressedChunks
where where
toCompress = toCompress' -- toBurrowsWheeler toCompress'
-- toTraceShow' = (unserealizeWord4Map . serealizeWord4Map $ twoByteMarkovs) == twoByteMarkovs -- toTraceShow' = (unserealizeWord4Map . serealizeWord4Map $ twoByteMarkovs) == twoByteMarkovs
-- toTraceShow = (fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) -- toTraceShow = (fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))
unCompressedChunks = chunk chunkLength toCompress unCompressedChunks = chunk chunkLength toCompress
twoByteMarkovs = M.map (map (Bi.second discretizeFraction)) . twoByteMarkov $ toCompress relativeFrequencies = map (Bi.second discretizeFraction) . listToRelativeCounts $ toCompress
rings = M.map toRing twoByteMarkovs rings = toRing relativeFrequencies
compressChunk toCompress = compressChunk toCompress =
Chunk Chunk
{ location = shortestLocation endStats, { location = shortestLocation endStats,
start = US.head toCompress,
length = fromIntegral . L.length $ toCompress length = fromIntegral . L.length $ toCompress
} }
where where
pairs = zip toCompress . US.tail $ toCompress
shortestLocation (WordMarkovStats {..}) = serialize $ binarySearch location (location + size) shortestLocation (WordMarkovStats {..}) = serialize $ binarySearch location (location + size)
-- shortestLocation (WordMarkovStats {..})
-- -- | simplestBetweenAnswer /= upperBound = simplestBetweenAnswer
-- | otherwise = simplestBetween location $ upperBound - epsilon
-- where
-- simplestBetweenAnswer = simplestBetween location upperBound
-- upperBound = location + size endStats = pyramidFold addWordMarkovStats . map statsFor $ toCompress
-- -- | almost entirely arbitrary, picked because it's gaurenteed to be smaller than size statsFor = (rings M.!)
-- -- | and should make for a fairly quick subtraction
-- epsilon = 1 % (denominator size)
endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
statsFor (x0, x1) = (rings M.! x0) M.! x1
addWordMarkovStats addWordMarkovStats
(WordMarkovStats {location = prevLoc, size = prevSize}) (WordMarkovStats {location = prevLoc, size = prevSize})
@ -270,7 +249,6 @@ compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap
size = prevSize * nextSize size = prevSize * nextSize
} }
pyramidFold :: (a -> a -> a) -> [a] -> a pyramidFold :: (a -> a -> a) -> [a] -> a
pyramidFold f = pyramid pyramidFold f = pyramid
where where
@ -286,7 +264,7 @@ simplestBetween :: Rational -> Rational -> Rational
simplestBetween x y simplestBetween x y
| x == y = x | x == y = x
| x > 0 = simplestBetween' n d n' d' | x > 0 = simplestBetween' n d n' d'
-- | x > 0 = simplestBetween' n d n' d' -- \| x > 0 = simplestBetween' n d n' d'
| otherwise = 0 % 1 | otherwise = 0 % 1
where where
n = numerator x n = numerator x

View file

@ -30,7 +30,7 @@ import Options.Generic qualified as O
import System.Environment qualified as SE import System.Environment qualified as SE
import Data.HuffmanTree import Data.HuffmanTree
import Data.Word4 (CompressWord4List(xs)) import Data.Word4 (CompressWord4List(xs))
import Compress.BurrowsWheeler (toBurrowsWheeler, toCounts) import Codec.Winery qualified as C
decompress decompress
:: forall a :: forall a
@ -85,13 +85,13 @@ compress dividedByteString =
treeDirMap :: M.Map a [TreeDir] treeDirMap :: M.Map a [TreeDir]
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b) -- compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b)
compressWithBurrowsWheeler bs -- compressWithBurrowsWheeler bs
= liftA2 BurrowsWheelerCompressed -- = liftA2 BurrowsWheelerCompressed
(compress . map fst $ withCounts) -- (compress . map fst $ withCounts)
(compress . map snd $ withCounts) -- (compress . map snd $ withCounts)
where -- where
withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs -- withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs
-- testCompression -- testCompression
-- :: forall a -- :: forall a
@ -136,18 +136,18 @@ divideByteString n bs = x : divideByteString n xs
(x, xs) = BS.splitAt n bs (x, xs) = BS.splitAt n bs
compressionRatioFor -- compressionRatioFor
:: forall a b -- :: forall a b
. (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a, -- . (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a,
Integral b, Ord b, C.Serialise b, Bounded b) -- Integral b, Ord b, C.Serialise b, Bounded b)
=> D.Proxy a -- => D.Proxy a
-> D.Proxy b -- -> D.Proxy b
-> BS.ByteString -- -> BS.ByteString
-> Double -- -> Double
compressionRatioFor _ _ bs = -- compressionRatioFor _ _ bs =
(/ (fromIntegral . BS.length $ bs)) -- (/ (fromIntegral . BS.length $ bs))
. fromIntegral -- . fromIntegral
. BS.length -- . BS.length
. C.serialise -- . C.serialise
. (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b)) -- . (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b))
$ bs -- $ bs

View file

@ -1,8 +0,0 @@
module Compress.WordMarkovStats where
data WordMarkovStats = WordMarkovStats
{ location :: Ratio Integer,
size :: Ratio Integer
}
deriving (Show, Eq, Ord, Generic)

View file

@ -1,7 +1,5 @@
import Compress.Arithmetic (twoByteMarkov)
import Compress.Arithmetic qualified as A import Compress.Arithmetic qualified as A
import Compress.LengthDistancePairs as LDP import Compress.LengthDistancePairs as LDP
import Compress.WordMarkovStats as WM
import Data.Bifunctor qualified as Bi import Data.Bifunctor qualified as Bi
import Data.ByteString qualified as BS import Data.ByteString qualified as BS
import Data.Dirs as D import Data.Dirs as D
@ -28,24 +26,24 @@ tests toCompress =
"falsify" "falsify"
[ testGroup [ testGroup
"Arithmetic" "Arithmetic"
[ testCase "works" $ [ -- testCase "works" $
A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8]) -- A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8])
@?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])], -- @?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])],
testCase "relativeCounts works as expected with one param" $ -- testCase "relativeCounts works as expected with one param" $
A.relativeCounts [(0, 30)] -- A.relativeCounts [(0, 30)]
@?= [(0, 1)], -- @?= [(0, 1)],
testCase "relativeCounts works as expected with lots of params" $ -- testCase "relativeCounts works as expected with lots of params" $
A.relativeCounts [(0, 30), (1, 20)] -- A.relativeCounts [(0, 30), (1, 20)]
@?= [(0, 30 % 50), (1, 20 % 50)], -- @?= [(0, 30 % 50), (1, 20 % 50)],
testCase "toRing locs all less than 1" $ -- testCase "toRing locs all less than 1" $
assertBool "larger than one" $ -- assertBool "larger than one" $
all (all ((<= 1) . (.location))) toCompressRing, -- all (all ((<= 1) . (.location))) toCompressRing,
testCase "toRing sizes all add up to 1" $ -- testCase "toRing sizes all add up to 1" $
assertBool "larger than 1" $ -- assertBool "larger than 1" $
all ((== 1) . sum . map (.size)) toCompressRing, -- all ((== 1) . sum . map (.size)) toCompressRing,
testCase "toRing gives no zero sizes" $ -- testCase "toRing gives no zero sizes" $
assertBool "== 0" $ -- assertBool "== 0" $
all (all ((/= 0) . (.size))) toCompressRing, -- all (all ((/= 0) . (.size))) toCompressRing,
F.testProperty "binary search" propBinarySearchWithinBounds, F.testProperty "binary search" propBinarySearchWithinBounds,
F.testProperty "compress and decompress isomorphism" (propCompressDecompressIsomorphism (A.decompress . A.compress)) F.testProperty "compress and decompress isomorphism" (propCompressDecompressIsomorphism (A.decompress . A.compress))
] ]
@ -55,12 +53,12 @@ tests toCompress =
-- ] -- ]
] ]
where where
toCompressRing = -- toCompressRing =
map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8))) -- map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8)))
. M.elems -- . M.elems
. A.twoByteMarkov -- . A.twoByteMarkov
. BS.unpack -- . BS.unpack
$ toCompress -- $ toCompress
wordMaxBound :: Integer wordMaxBound :: Integer
wordMaxBound = fromIntegral (maxBound :: Word) wordMaxBound = fromIntegral (maxBound :: Word)