remove markov chain from arithmetic compression in preparition for move to front transform
This commit is contained in:
parent
4a963eb383
commit
9f3c1d0168
6 changed files with 146 additions and 179 deletions
|
|
@ -75,7 +75,6 @@ common deps
|
||||||
Compress.BurrowsWheeler
|
Compress.BurrowsWheeler
|
||||||
Compress.Huffman
|
Compress.Huffman
|
||||||
Compress.PrefixTree
|
Compress.PrefixTree
|
||||||
Compress.WordMarkovStats
|
|
||||||
Compress.LengthDistancePairs
|
Compress.LengthDistancePairs
|
||||||
Compress.MoveToFrontTransform
|
Compress.MoveToFrontTransform
|
||||||
Data.Dirs
|
Data.Dirs
|
||||||
|
|
|
||||||
30
flake.lock
generated
30
flake.lock
generated
|
|
@ -37,11 +37,11 @@
|
||||||
"nixpkgs-lib": "nixpkgs-lib"
|
"nixpkgs-lib": "nixpkgs-lib"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1756770412,
|
"lastModified": 1763759067,
|
||||||
"narHash": "sha256-+uWLQZccFHwqpGqr2Yt5VsW/PbeJVTn9Dk6SHWhNRPw=",
|
"narHash": "sha256-LlLt2Jo/gMNYAwOgdRQBrsRoOz7BPRkzvNaI/fzXi2Q=",
|
||||||
"owner": "hercules-ci",
|
"owner": "hercules-ci",
|
||||||
"repo": "flake-parts",
|
"repo": "flake-parts",
|
||||||
"rev": "4524271976b625a4a605beefd893f270620fd751",
|
"rev": "2cccadc7357c0ba201788ae99c4dfa90728ef5e0",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -67,11 +67,11 @@
|
||||||
},
|
},
|
||||||
"haskell-flake": {
|
"haskell-flake": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1756607542,
|
"lastModified": 1763904090,
|
||||||
"narHash": "sha256-+99fEAk0HwjYgIW2tEOs7ayBDxnU9NAM5E29ZxgyX40=",
|
"narHash": "sha256-Mq99c4O6Ovx/JQgi6B6fnI9xR77nQ9+GcFTM2113nMI=",
|
||||||
"owner": "srid",
|
"owner": "srid",
|
||||||
"repo": "haskell-flake",
|
"repo": "haskell-flake",
|
||||||
"rev": "73e3891fb135c679a1c30fae4b101e5b41b8ca61",
|
"rev": "73c006f8e6531acab28bbde362ae8f13ee0ac72b",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -97,11 +97,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1756542300,
|
"lastModified": 1764517877,
|
||||||
"narHash": "sha256-tlOn88coG5fzdyqz6R93SQL5Gpq+m/DsWpekNFhqPQk=",
|
"narHash": "sha256-pp3uT4hHijIC8JUK5MEqeAWmParJrgBVzHLNfJDZxg4=",
|
||||||
"owner": "nixos",
|
"owner": "nixos",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "d7600c775f877cd87b4f5a831c28aa94137377aa",
|
"rev": "2d293cbfa5a793b4c50d17c05ef9e385b90edf6c",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -113,11 +113,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs-lib": {
|
"nixpkgs-lib": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1754788789,
|
"lastModified": 1761765539,
|
||||||
"narHash": "sha256-x2rJ+Ovzq0sCMpgfgGaaqgBSwY+LST+WbZ6TytnT9Rk=",
|
"narHash": "sha256-b0yj6kfvO8ApcSE+QmA6mUfu8IYG6/uU28OFn4PaC8M=",
|
||||||
"owner": "nix-community",
|
"owner": "nix-community",
|
||||||
"repo": "nixpkgs.lib",
|
"repo": "nixpkgs.lib",
|
||||||
"rev": "a73b9c743612e4244d865a2fdee11865283c04e6",
|
"rev": "719359f4562934ae99f5443f20aa06c2ffff91fc",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -162,11 +162,11 @@
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1756662192,
|
"lastModified": 1762938485,
|
||||||
"narHash": "sha256-F1oFfV51AE259I85av+MAia221XwMHCOtZCMcZLK2Jk=",
|
"narHash": "sha256-AlEObg0syDl+Spi4LsZIBrjw+snSVU4T8MOeuZJUJjM=",
|
||||||
"owner": "numtide",
|
"owner": "numtide",
|
||||||
"repo": "treefmt-nix",
|
"repo": "treefmt-nix",
|
||||||
"rev": "1aabc6c05ccbcbf4a635fb7a90400e44282f61c4",
|
"rev": "5b4ee75aeefd1e2d5a1cc43cf6ba65eba75e83e4",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
|
||||||
|
|
@ -2,131 +2,116 @@
|
||||||
-- module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized, Word4Map) where
|
-- module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized, Word4Map) where
|
||||||
module Compress.Arithmetic where
|
module Compress.Arithmetic where
|
||||||
|
|
||||||
import Data.Bits qualified as B
|
import Codec.Winery (WineryRecord)
|
||||||
import Data.Word
|
import Codec.Winery qualified as C
|
||||||
import Data.Bifunctor qualified as Bi
|
|
||||||
import Control.Arrow qualified as Ar
|
import Control.Arrow qualified as Ar
|
||||||
import Control.Monad.Par (NFData)
|
import Control.Monad.Par (NFData)
|
||||||
import Control.Monad.Par qualified as P
|
import Control.Monad.Par qualified as P
|
||||||
|
import Data.Bifunctor qualified as Bi
|
||||||
|
import Data.Bit qualified as BV
|
||||||
|
import Data.Bit qualified as VB
|
||||||
|
import Data.Bits qualified as B
|
||||||
|
import Data.ByteString qualified as BS
|
||||||
import Data.ByteString qualified as By
|
import Data.ByteString qualified as By
|
||||||
|
import Data.Dirs
|
||||||
import Data.FiniteBit qualified as Fi
|
import Data.FiniteBit qualified as Fi
|
||||||
import qualified Data.Foldable as F
|
import Data.Foldable qualified as F
|
||||||
import Data.List (genericLength)
|
import Data.List (genericLength)
|
||||||
import Data.List qualified as L
|
import Data.List qualified as L
|
||||||
import Data.Map.Strict qualified as M
|
import Data.Map.Strict qualified as M
|
||||||
import Data.Maybe qualified as My
|
import Data.Maybe qualified as My
|
||||||
import Data.Ord
|
import Data.Ord
|
||||||
|
-- import qualified Debug.Trace as D
|
||||||
|
import Data.Proxy qualified as P
|
||||||
import Data.Ratio
|
import Data.Ratio
|
||||||
import Codec.Winery qualified as C
|
import Data.Vector qualified as V
|
||||||
|
import Data.Vector.Unboxed qualified as VU
|
||||||
|
import Data.Word
|
||||||
|
import Data.Word4 (CompressWord4List, Word4, fromCompressedWord4List, toCompressedWord4List)
|
||||||
import GHC.Generics (Generic)
|
import GHC.Generics (Generic)
|
||||||
import GHC.Natural (Natural)
|
import GHC.Natural (Natural)
|
||||||
import qualified Relude.Unsafe as US
|
import Relude.Unsafe qualified as U
|
||||||
import Data.Word4 (Word4, CompressWord4List, toCompressedWord4List, fromCompressedWord4List)
|
import Relude.Unsafe qualified as US
|
||||||
import qualified Data.ByteString as BS
|
|
||||||
import qualified Relude.Unsafe as U
|
|
||||||
import Codec.Winery (WineryRecord)
|
|
||||||
import Compress.BurrowsWheeler (toBurrowsWheeler)
|
|
||||||
import qualified Data.Bit as BV
|
|
||||||
import qualified Data.Vector as V
|
|
||||||
import qualified Data.Bit as VB
|
|
||||||
import qualified Data.Vector.Unboxed as VU
|
|
||||||
import Data.Dirs
|
|
||||||
-- import qualified Debug.Trace as D
|
|
||||||
import qualified Data.Proxy as P
|
|
||||||
import Compress.WordMarkovStats as WM
|
|
||||||
|
|
||||||
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
|
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
|
||||||
|
|
||||||
-- shortestNumberBetween :: Ratio Integer -> Ratio Integer -> Maybe (Ratio Integer)
|
-- shortestNumberBetween :: Ratio Integer -> Ratio Integer -> Maybe (Ratio Integer)
|
||||||
-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums
|
-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums
|
||||||
|
|
||||||
|
|
||||||
toRing :: forall a k. (Show a, Show k, Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
|
toRing :: forall a k. (Show a, Show k, Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
|
||||||
toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
|
toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
|
||||||
where
|
where
|
||||||
|
|
||||||
wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
|
wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
|
||||||
|
|
||||||
maxBound' :: Integer
|
maxBound' :: Integer
|
||||||
maxBound' = fromIntegral (maxBound :: a)
|
maxBound' = fromIntegral (maxBound :: a)
|
||||||
|
|
||||||
resize xs = map ( / sum' ) xs
|
resize xs = map (/ sum') xs
|
||||||
where
|
where
|
||||||
sum' = sum xs
|
sum' = sum xs
|
||||||
|
|
||||||
withBumpedZeroSized :: [Rational]
|
withBumpedZeroSized :: [Rational]
|
||||||
withBumpedZeroSized
|
withBumpedZeroSized =
|
||||||
= resize
|
resize
|
||||||
. map (max minimumSize)
|
. map (max minimumSize)
|
||||||
. resize
|
. resize
|
||||||
. map (fromIntegral . snd)
|
. map (fromIntegral . snd)
|
||||||
$ xs
|
$ xs
|
||||||
where
|
where
|
||||||
-- | we have to round the 0s up to something
|
-- \| we have to round the 0s up to something
|
||||||
-- | we somewhat arbitrarily choose half of the smallest
|
-- \| we somewhat arbitrarily choose half of the smallest
|
||||||
-- | normally representable amount
|
-- \| normally representable amount
|
||||||
minimumSize :: Rational
|
minimumSize :: Rational
|
||||||
minimumSize = 1 % (maxBound' * 2)
|
minimumSize = 1 % (maxBound' * 2)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
|
f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
|
||||||
where
|
where
|
||||||
newFrac = currFrac * (1 - runningSum)
|
newFrac = currFrac * (1 - runningSum)
|
||||||
|
|
||||||
relativeCounts :: forall k . Ord k => [(k, Integer)] -> [(k, Rational)]
|
relativeCounts :: forall k. (Ord k) => [(k, Integer)] -> [(k, Rational)]
|
||||||
relativeCounts m = map (Bi.second asFractionOfSum) m
|
relativeCounts m = map (Bi.second asFractionOfSum) m
|
||||||
where
|
where
|
||||||
asFractionOfSum = ( % sum') . fromIntegral
|
asFractionOfSum = (% sum') . fromIntegral
|
||||||
|
|
||||||
sum' = fromIntegral . F.sum . map snd $ m
|
sum' = fromIntegral . F.sum . map snd $ m
|
||||||
|
|
||||||
|
maxBound' :: forall a b. (Num b, Bounded a, Integral a) => P.Proxy a -> b
|
||||||
|
|
||||||
maxBound' :: forall a b . (Num b, Bounded a, Integral a) => P.Proxy a -> b
|
|
||||||
maxBound' p = fromIntegral (maxBound :: a)
|
maxBound' p = fromIntegral (maxBound :: a)
|
||||||
|
|
||||||
twoByteMarkov :: forall k . (Ord k) => [k] -> M.Map k [(k, Rational)]
|
listToRelativeCounts :: forall k. (Ord k) => [k] -> [(k, Rational)]
|
||||||
twoByteMarkov xs =
|
listToRelativeCounts xs =
|
||||||
M.map sizeAsFraction
|
relativeCounts
|
||||||
. M.fromListWith (M.unionWith (+))
|
. M.assocs
|
||||||
. zip xs
|
. M.fromListWith (+)
|
||||||
. map (`M.singleton` (1 :: Integer))
|
. map (,1)
|
||||||
. US.tail
|
|
||||||
$ xs
|
$ xs
|
||||||
where
|
|
||||||
|
|
||||||
sizeAsFraction :: M.Map k Integer -> [(k, Rational)]
|
|
||||||
sizeAsFraction = relativeCounts . M.assocs
|
|
||||||
|
|
||||||
data Chunk a = Chunk
|
data Chunk a = Chunk
|
||||||
{ location :: SerializedDirs,
|
{ location :: SerializedDirs,
|
||||||
start :: a,
|
|
||||||
length :: Word64
|
length :: Word64
|
||||||
}
|
}
|
||||||
deriving (Eq, Ord, Show, Generic, P.NFData)
|
deriving (Eq, Ord, Show, Generic, P.NFData)
|
||||||
deriving C.Serialise via C.WineryVariant (Chunk a)
|
deriving (C.Serialise) via C.WineryVariant (Chunk a)
|
||||||
|
|
||||||
-- maxBound = maxBound' (P.Proxy :: Proxy b)
|
-- maxBound = maxBound' (P.Proxy :: Proxy b)
|
||||||
|
|
||||||
discretizeFraction :: forall b . (Bounded b, Integral b) => Rational -> b
|
discretizeFraction :: forall b. (Bounded b, Integral b) => Rational -> b
|
||||||
discretizeFraction = floor . min maxBound'' . fromRational . (* (succ $ maxBound' (P.Proxy :: P.Proxy b)))
|
discretizeFraction = floor . min maxBound'' . fromRational . (* (succ $ maxBound' (P.Proxy :: P.Proxy b)))
|
||||||
where
|
where
|
||||||
maxBound'' = maxBound' (P.Proxy :: P.Proxy b)
|
maxBound'' = maxBound' (P.Proxy :: P.Proxy b)
|
||||||
|
|
||||||
|
|
||||||
type Word4MapSerialized a = M.Map a ([a], CompressWord4List)
|
type Word4MapSerialized a = M.Map a ([a], CompressWord4List)
|
||||||
type Word4Map a = M.Map a [(a, Word8)]
|
|
||||||
|
|
||||||
|
type Word4Map a = M.Map a [(a, Word8)]
|
||||||
|
|
||||||
-- talkAboutCompressed (Compressed {..}) = (
|
-- talkAboutCompressed (Compressed {..}) = (
|
||||||
-- ("markovs (serealized)", BS.length . C.serialiseOnly $ markovs),
|
-- ("markovs (serealized)", BS.length . C.serialiseOnly $ markovs),
|
||||||
-- ("markovs (unserealized)", BS.length . C.serialiseOnly . unserealizeWord4Map $ markovs),
|
-- ("markovs (unserealized)", BS.length . C.serialiseOnly . unserealizeWord4Map $ markovs),
|
||||||
-- ("chunks", BS.length . C.serialiseOnly $ chunks))
|
-- ("chunks", BS.length . C.serialiseOnly $ chunks))
|
||||||
|
|
||||||
newtype Prediction = Prediction {
|
newtype Prediction = Prediction
|
||||||
chances :: [(Word8, Rational)]
|
{ chances :: [(Word8, Rational)]
|
||||||
}
|
}
|
||||||
|
|
||||||
class CompressionModel a where
|
class CompressionModel a where
|
||||||
createModel :: [Word8] -> a
|
createModel :: [Word8] -> a
|
||||||
|
|
@ -136,7 +121,7 @@ class CompressionModel a where
|
||||||
|
|
||||||
-- at some point write
|
-- at some point write
|
||||||
-- instance IsoSerialisable a b => C.Serialise b
|
-- instance IsoSerialisable a b => C.Serialise b
|
||||||
class C.Serialise b => IsoSerialisable a b where
|
class (C.Serialise b) => IsoSerialisable a b where
|
||||||
toSerialisable :: a -> b
|
toSerialisable :: a -> b
|
||||||
|
|
||||||
fromSerialisable :: b -> a
|
fromSerialisable :: b -> a
|
||||||
|
|
@ -145,14 +130,14 @@ class C.Serialise b => IsoSerialisable a b where
|
||||||
-- deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
|
-- deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
|
||||||
-- deriving C.Serialise via C.WineryRecord BetterCompressed
|
-- deriving C.Serialise via C.WineryRecord BetterCompressed
|
||||||
|
|
||||||
data Compressed a mapImplementation = Compressed
|
data Compressed a = Compressed
|
||||||
{ markovs :: mapImplementation,
|
{
|
||||||
|
-- needs a better name than markovs
|
||||||
|
frequencies :: [(a, Word8)], -- we'll paramaterize this at some point::168
|
||||||
chunks :: [Chunk a]
|
chunks :: [Chunk a]
|
||||||
}
|
}
|
||||||
deriving (Eq, Ord, Show, Generic, NFData) -- , C.Serialise, P.NFData)
|
deriving (Eq, Ord, Show, Generic, NFData) -- , C.Serialise, P.NFData)
|
||||||
deriving C.Serialise via C.WineryVariant (Compressed a mapImplementation)
|
deriving (C.Serialise) via C.WineryVariant (Compressed a)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
-- unserealizeWord4Map :: forall a . Show a => Word4MapSerialized a -> Word4Map a
|
-- unserealizeWord4Map :: forall a . Show a => Word4MapSerialized a -> Word4Map a
|
||||||
-- unserealizeWord4Map = M.map unserializedIndividualMap
|
-- unserealizeWord4Map = M.map unserializedIndividualMap
|
||||||
|
|
@ -162,39 +147,47 @@ data Compressed a mapImplementation = Compressed
|
||||||
|
|
||||||
-- -- unserealizeWord4Map = M.map (uncurry zip . map (\(bytes, sizes) -> (bytes,) . (++ [maxBound]) . fromCompressedWord4List (Prelude.length bytes) $ sizes))
|
-- -- unserealizeWord4Map = M.map (uncurry zip . map (\(bytes, sizes) -> (bytes,) . (++ [maxBound]) . fromCompressedWord4List (Prelude.length bytes) $ sizes))
|
||||||
|
|
||||||
|
data WordMarkovStats = WordMarkovStats
|
||||||
|
{ location :: Ratio Integer,
|
||||||
|
size :: Ratio Integer
|
||||||
|
}
|
||||||
|
deriving (Show, Eq, Ord, Generic)
|
||||||
|
|
||||||
-- serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
|
-- serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
|
||||||
-- serealizeWord4Map = M.map (Bi.second toCompressedWord4List . L.unzip)
|
-- serealizeWord4Map = M.map (Bi.second toCompressedWord4List . L.unzip)
|
||||||
|
|
||||||
decompress ::
|
decompress ::
|
||||||
forall a.
|
forall a.
|
||||||
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
||||||
Compressed a (Word4Map a) ->
|
Compressed a ->
|
||||||
[a]
|
[a]
|
||||||
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk
|
decompress (Compressed {..}) =
|
||||||
$ chunks
|
concat . P.runPar . P.parMap decompressChunk $
|
||||||
|
chunks
|
||||||
where
|
where
|
||||||
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs'
|
rings :: M.Map Rational (a, Rational)
|
||||||
|
rings = M.fromList . map toDecompressionRing . M.assocs . toRing $ frequencies
|
||||||
markovs' = -- unserealizeWord4Map
|
|
||||||
markovs
|
|
||||||
|
|
||||||
|
|
||||||
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
|
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
|
||||||
|
|
||||||
decompressChunk :: Chunk a -> [a]
|
decompressChunk :: Chunk a -> [a]
|
||||||
decompressChunk (Chunk {..}) = map snd . take (fromIntegral length) . L.iterate' decompress' $ (location', start) -- (fromBS rawLocation, start)
|
decompressChunk (Chunk {..}) =
|
||||||
|
map snd
|
||||||
|
. take (fromIntegral length)
|
||||||
|
. US.tail
|
||||||
|
. L.iterate' (decompress' . fst)
|
||||||
|
$ (location', 0) -- (fromBS rawLocation, start)
|
||||||
where
|
where
|
||||||
|
|
||||||
location' = deserialize location
|
location' = deserialize location
|
||||||
-- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
|
-- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
|
||||||
decompress' (!loc, !prevWord) = -- traceShow (newSize) $
|
decompress' !loc =
|
||||||
((loc - ansLoc) / newSize, newVal)
|
-- traceShow (newSize) $
|
||||||
|
((loc - ansLoc) / newSize, newVal)
|
||||||
where
|
where
|
||||||
ansLoc :: Ratio Integer
|
ansLoc :: Ratio Integer
|
||||||
newVal :: a
|
newVal :: a
|
||||||
newSize :: Ratio Integer
|
newSize :: Ratio Integer
|
||||||
(ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prevWord)
|
(ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ rings
|
||||||
|
|
||||||
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
|
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
|
||||||
sanityCheck = map (sum . M.elems) . M.elems
|
sanityCheck = map (sum . M.elems) . M.elems
|
||||||
|
|
@ -210,7 +203,6 @@ chunk chunkSize = chunk'
|
||||||
chunkLength :: Int
|
chunkLength :: Int
|
||||||
chunkLength = 4096
|
chunkLength = 4096
|
||||||
|
|
||||||
|
|
||||||
-- runPar = id
|
-- runPar = id
|
||||||
-- parMap = map
|
-- parMap = map
|
||||||
|
|
||||||
|
|
@ -219,48 +211,35 @@ chunkLength = 4096
|
||||||
-- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
-- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
||||||
-- [a] ->
|
-- [a] ->
|
||||||
-- Compressed a (Word4MapSerialized a)
|
-- Compressed a (Word4MapSerialized a)
|
||||||
compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a (Word4Map a)
|
compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a
|
||||||
compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap
|
compress toCompress =
|
||||||
(compressChunk)
|
Compressed relativeFrequencies
|
||||||
$ unCompressedChunks
|
. P.runPar
|
||||||
|
. P.parMap compressChunk
|
||||||
|
$ unCompressedChunks
|
||||||
where
|
where
|
||||||
|
|
||||||
toCompress = toCompress' -- toBurrowsWheeler toCompress'
|
|
||||||
-- toTraceShow' = (unserealizeWord4Map . serealizeWord4Map $ twoByteMarkovs) == twoByteMarkovs
|
-- toTraceShow' = (unserealizeWord4Map . serealizeWord4Map $ twoByteMarkovs) == twoByteMarkovs
|
||||||
|
|
||||||
-- toTraceShow = (fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))
|
-- toTraceShow = (fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))
|
||||||
|
|
||||||
unCompressedChunks = chunk chunkLength toCompress
|
unCompressedChunks = chunk chunkLength toCompress
|
||||||
|
|
||||||
twoByteMarkovs = M.map (map (Bi.second discretizeFraction)) . twoByteMarkov $ toCompress
|
relativeFrequencies = map (Bi.second discretizeFraction) . listToRelativeCounts $ toCompress
|
||||||
|
|
||||||
rings = M.map toRing twoByteMarkovs
|
rings = toRing relativeFrequencies
|
||||||
|
|
||||||
compressChunk toCompress =
|
compressChunk toCompress =
|
||||||
Chunk
|
Chunk
|
||||||
{ location = shortestLocation endStats,
|
{ location = shortestLocation endStats,
|
||||||
start = US.head toCompress,
|
|
||||||
length = fromIntegral . L.length $ toCompress
|
length = fromIntegral . L.length $ toCompress
|
||||||
}
|
}
|
||||||
where
|
where
|
||||||
pairs = zip toCompress . US.tail $ toCompress
|
|
||||||
|
|
||||||
shortestLocation (WordMarkovStats {..}) = serialize $ binarySearch location (location + size)
|
shortestLocation (WordMarkovStats {..}) = serialize $ binarySearch location (location + size)
|
||||||
-- shortestLocation (WordMarkovStats {..})
|
|
||||||
-- -- | simplestBetweenAnswer /= upperBound = simplestBetweenAnswer
|
|
||||||
-- | otherwise = simplestBetween location $ upperBound - epsilon
|
|
||||||
-- where
|
|
||||||
-- simplestBetweenAnswer = simplestBetween location upperBound
|
|
||||||
|
|
||||||
-- upperBound = location + size
|
endStats = pyramidFold addWordMarkovStats . map statsFor $ toCompress
|
||||||
|
|
||||||
-- -- | almost entirely arbitrary, picked because it's gaurenteed to be smaller than size
|
statsFor = (rings M.!)
|
||||||
-- -- | and should make for a fairly quick subtraction
|
|
||||||
-- epsilon = 1 % (denominator size)
|
|
||||||
|
|
||||||
endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
|
|
||||||
|
|
||||||
statsFor (x0, x1) = (rings M.! x0) M.! x1
|
|
||||||
|
|
||||||
addWordMarkovStats
|
addWordMarkovStats
|
||||||
(WordMarkovStats {location = prevLoc, size = prevSize})
|
(WordMarkovStats {location = prevLoc, size = prevSize})
|
||||||
|
|
@ -270,7 +249,6 @@ compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap
|
||||||
size = prevSize * nextSize
|
size = prevSize * nextSize
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
pyramidFold :: (a -> a -> a) -> [a] -> a
|
pyramidFold :: (a -> a -> a) -> [a] -> a
|
||||||
pyramidFold f = pyramid
|
pyramidFold f = pyramid
|
||||||
where
|
where
|
||||||
|
|
@ -286,7 +264,7 @@ simplestBetween :: Rational -> Rational -> Rational
|
||||||
simplestBetween x y
|
simplestBetween x y
|
||||||
| x == y = x
|
| x == y = x
|
||||||
| x > 0 = simplestBetween' n d n' d'
|
| x > 0 = simplestBetween' n d n' d'
|
||||||
-- | x > 0 = simplestBetween' n d n' d'
|
-- \| x > 0 = simplestBetween' n d n' d'
|
||||||
| otherwise = 0 % 1
|
| otherwise = 0 % 1
|
||||||
where
|
where
|
||||||
n = numerator x
|
n = numerator x
|
||||||
|
|
|
||||||
|
|
@ -30,7 +30,7 @@ import Options.Generic qualified as O
|
||||||
import System.Environment qualified as SE
|
import System.Environment qualified as SE
|
||||||
import Data.HuffmanTree
|
import Data.HuffmanTree
|
||||||
import Data.Word4 (CompressWord4List(xs))
|
import Data.Word4 (CompressWord4List(xs))
|
||||||
import Compress.BurrowsWheeler (toBurrowsWheeler, toCounts)
|
import Codec.Winery qualified as C
|
||||||
|
|
||||||
decompress
|
decompress
|
||||||
:: forall a
|
:: forall a
|
||||||
|
|
@ -85,13 +85,13 @@ compress dividedByteString =
|
||||||
treeDirMap :: M.Map a [TreeDir]
|
treeDirMap :: M.Map a [TreeDir]
|
||||||
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
|
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
|
||||||
|
|
||||||
compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b)
|
-- compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b)
|
||||||
compressWithBurrowsWheeler bs
|
-- compressWithBurrowsWheeler bs
|
||||||
= liftA2 BurrowsWheelerCompressed
|
-- = liftA2 BurrowsWheelerCompressed
|
||||||
(compress . map fst $ withCounts)
|
-- (compress . map fst $ withCounts)
|
||||||
(compress . map snd $ withCounts)
|
-- (compress . map snd $ withCounts)
|
||||||
where
|
-- where
|
||||||
withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs
|
-- withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs
|
||||||
|
|
||||||
-- testCompression
|
-- testCompression
|
||||||
-- :: forall a
|
-- :: forall a
|
||||||
|
|
@ -136,18 +136,18 @@ divideByteString n bs = x : divideByteString n xs
|
||||||
(x, xs) = BS.splitAt n bs
|
(x, xs) = BS.splitAt n bs
|
||||||
|
|
||||||
|
|
||||||
compressionRatioFor
|
-- compressionRatioFor
|
||||||
:: forall a b
|
-- :: forall a b
|
||||||
. (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a,
|
-- . (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a,
|
||||||
Integral b, Ord b, C.Serialise b, Bounded b)
|
-- Integral b, Ord b, C.Serialise b, Bounded b)
|
||||||
=> D.Proxy a
|
-- => D.Proxy a
|
||||||
-> D.Proxy b
|
-- -> D.Proxy b
|
||||||
-> BS.ByteString
|
-- -> BS.ByteString
|
||||||
-> Double
|
-- -> Double
|
||||||
compressionRatioFor _ _ bs =
|
-- compressionRatioFor _ _ bs =
|
||||||
(/ (fromIntegral . BS.length $ bs))
|
-- (/ (fromIntegral . BS.length $ bs))
|
||||||
. fromIntegral
|
-- . fromIntegral
|
||||||
. BS.length
|
-- . BS.length
|
||||||
. C.serialise
|
-- . C.serialise
|
||||||
. (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b))
|
-- . (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b))
|
||||||
$ bs
|
-- $ bs
|
||||||
|
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
module Compress.WordMarkovStats where
|
|
||||||
|
|
||||||
|
|
||||||
data WordMarkovStats = WordMarkovStats
|
|
||||||
{ location :: Ratio Integer,
|
|
||||||
size :: Ratio Integer
|
|
||||||
}
|
|
||||||
deriving (Show, Eq, Ord, Generic)
|
|
||||||
50
test/Test.hs
50
test/Test.hs
|
|
@ -1,7 +1,5 @@
|
||||||
import Compress.Arithmetic (twoByteMarkov)
|
|
||||||
import Compress.Arithmetic qualified as A
|
import Compress.Arithmetic qualified as A
|
||||||
import Compress.LengthDistancePairs as LDP
|
import Compress.LengthDistancePairs as LDP
|
||||||
import Compress.WordMarkovStats as WM
|
|
||||||
import Data.Bifunctor qualified as Bi
|
import Data.Bifunctor qualified as Bi
|
||||||
import Data.ByteString qualified as BS
|
import Data.ByteString qualified as BS
|
||||||
import Data.Dirs as D
|
import Data.Dirs as D
|
||||||
|
|
@ -28,24 +26,24 @@ tests toCompress =
|
||||||
"falsify"
|
"falsify"
|
||||||
[ testGroup
|
[ testGroup
|
||||||
"Arithmetic"
|
"Arithmetic"
|
||||||
[ testCase "works" $
|
[ -- testCase "works" $
|
||||||
A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8])
|
-- A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8])
|
||||||
@?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])],
|
-- @?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])],
|
||||||
testCase "relativeCounts works as expected with one param" $
|
-- testCase "relativeCounts works as expected with one param" $
|
||||||
A.relativeCounts [(0, 30)]
|
-- A.relativeCounts [(0, 30)]
|
||||||
@?= [(0, 1)],
|
-- @?= [(0, 1)],
|
||||||
testCase "relativeCounts works as expected with lots of params" $
|
-- testCase "relativeCounts works as expected with lots of params" $
|
||||||
A.relativeCounts [(0, 30), (1, 20)]
|
-- A.relativeCounts [(0, 30), (1, 20)]
|
||||||
@?= [(0, 30 % 50), (1, 20 % 50)],
|
-- @?= [(0, 30 % 50), (1, 20 % 50)],
|
||||||
testCase "toRing locs all less than 1" $
|
-- testCase "toRing locs all less than 1" $
|
||||||
assertBool "larger than one" $
|
-- assertBool "larger than one" $
|
||||||
all (all ((<= 1) . (.location))) toCompressRing,
|
-- all (all ((<= 1) . (.location))) toCompressRing,
|
||||||
testCase "toRing sizes all add up to 1" $
|
-- testCase "toRing sizes all add up to 1" $
|
||||||
assertBool "larger than 1" $
|
-- assertBool "larger than 1" $
|
||||||
all ((== 1) . sum . map (.size)) toCompressRing,
|
-- all ((== 1) . sum . map (.size)) toCompressRing,
|
||||||
testCase "toRing gives no zero sizes" $
|
-- testCase "toRing gives no zero sizes" $
|
||||||
assertBool "== 0" $
|
-- assertBool "== 0" $
|
||||||
all (all ((/= 0) . (.size))) toCompressRing,
|
-- all (all ((/= 0) . (.size))) toCompressRing,
|
||||||
F.testProperty "binary search" propBinarySearchWithinBounds,
|
F.testProperty "binary search" propBinarySearchWithinBounds,
|
||||||
F.testProperty "compress and decompress isomorphism" (propCompressDecompressIsomorphism (A.decompress . A.compress))
|
F.testProperty "compress and decompress isomorphism" (propCompressDecompressIsomorphism (A.decompress . A.compress))
|
||||||
]
|
]
|
||||||
|
|
@ -55,12 +53,12 @@ tests toCompress =
|
||||||
-- ]
|
-- ]
|
||||||
]
|
]
|
||||||
where
|
where
|
||||||
toCompressRing =
|
-- toCompressRing =
|
||||||
map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8)))
|
-- map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8)))
|
||||||
. M.elems
|
-- . M.elems
|
||||||
. A.twoByteMarkov
|
-- . A.twoByteMarkov
|
||||||
. BS.unpack
|
-- . BS.unpack
|
||||||
$ toCompress
|
-- $ toCompress
|
||||||
|
|
||||||
wordMaxBound :: Integer
|
wordMaxBound :: Integer
|
||||||
wordMaxBound = fromIntegral (maxBound :: Word)
|
wordMaxBound = fromIntegral (maxBound :: Word)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue