benchmark, test suite, and run length encoding

Jack Wines 2025-10-27 13:15:58 -07:00
parent 2123636291
commit da83f9a5d0
Signed by: Jack
SSH key fingerprint: SHA256:AaP2Hr/e3mEjeY+s9XJmQqAesqEms8ENRhwRkpO0WUk
19 changed files with 884 additions and 364 deletions

1
.gitignore vendored

@ -6,3 +6,4 @@
/cabal.project.local
/compress.hp
/*.prof
/calgarycorpus/

15
bench/Bench.hs Normal file

@ -0,0 +1,15 @@
import Criterion.Main
import Compress.LengthDistancePairs
import qualified Data.ByteString as BS
import qualified Compress.Arithmetic as A
main = do
testData <- BS.unpack . BS.take 10000 <$> (BS.readFile "pg64317.txt")
defaultMain [
bgroup "encode" [
bench "length distance pair" $ nf (encode :: [Word8] -> [LengthDistancePair Word32]) testData
],
bgroup "arithmetic coding" [
bench "pg64317.txt" $ nf A.compress testData
]
]
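
Note: criterion also offers env for keeping setup (the file read and BS.unpack here) out of the benchmarked closures; a minimal sketch of the same two benchmarks using it, where env comes from Criterion.Main and everything else is as imported above:

main :: IO ()
main = defaultMain
  [ env (BS.unpack . BS.take 10000 <$> BS.readFile "pg64317.txt") $ \testData ->
      bgroup "compress"
        [ bench "length distance pair" $ nf (encode :: [Word8] -> [LengthDistancePair Word32]) testData
        , bench "arithmetic coding"    $ nf A.compress testData
        ]
  ]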

compress.cabal

@ -6,23 +6,26 @@ build-type: Simple
license: MIT
license-file: LICENSE
executable compress
main-is:
Main.hs
common deps
default-extensions:
DataKinds,
DeriveAnyClass,
DeriveGeneric,
DerivingStrategies,
DerivingVia,
DuplicateRecordFields,
ExtendedDefaultRules,
FlexibleContexts,
FlexibleInstances,
ImpredicativeTypes,
InstanceSigs,
LambdaCase,
MultiParamTypeClasses,
NamedFieldPuns,
NoFieldSelectors,
OverloadedLabels,
OverloadedLists,
OverloadedRecordDot,
OverloadedStrings,
PartialTypeSignatures,
RankNTypes,
@ -32,15 +35,12 @@ executable compress
StandaloneDeriving,
StrictData,
TemplateHaskell,
LambdaCase,
TupleSections,
TypeApplications,
TypeFamilies,
TypeOperators,
TypeSynonymInstances,
UndecidableInstances,
hs-source-dirs:
src
mixins:
base hiding (Prelude),
relude (Relude as Prelude),
@ -48,17 +48,21 @@ executable compress
build-depends:
base,
bitvec,
bytestring,
cereal,
bytestring,
containers,
winery,
parsec,
-- accelerate,
-- containers-accelerate,
parsec,
leancheck,
monad-par,
monad-par-extras,
nonempty-containers,
optparse-generic,
relude,
pointless-fun,
text,
uuid,
vector,
@ -66,15 +70,63 @@ executable compress
default-language:
GHC2021
other-modules:
Data.PQueue
Data.FiniteBit
Compress.Arithmetic
Compress.BurrowsWheeler
Compress.Huffman
Compress.PrefixTree
Compress.WordMarkovStats
Compress.LengthDistancePairs
Data.Dirs
Data.FiniteBit
Data.HuffmanTree
Compress.Arithmetic
Data.PQueue
Data.Word4
Data.ArbitraryPrecisionFloatingPoint
-- Data.CircularList
hs-source-dirs:
src
ghc-options:
-threaded
-fprof-auto
-fprof-late
"-with-rtsopts=-p -hc -B -N -qa"
"-with-rtsopts=-N"
benchmark bench
import: deps
type:
exitcode-stdio-1.0
main-is:
Bench.hs
build-depends:
criterion
hs-source-dirs:
bench
ghc-options:
"-with-rtsopts=-T -p -hc -B -qa"
test-suite test
import: deps
type:
exitcode-stdio-1.0
main-is:
Test.hs
build-depends:
tasty,
tasty-hunit,
falsify
hs-source-dirs:
test
executable compress
import: deps
main-is:
Main.hs
hs-source-dirs:
src-exe
-- -fllvm

131
flake.lock generated

@ -1,18 +1,34 @@
{
"nodes": {
"accelerate": {
"barbies": {
"flake": false,
"locked": {
"lastModified": 1732969010,
"narHash": "sha256-Qrmtrgij2GbklBXUK42Pt6Db8WiGijA5sz5oC5AR72c=",
"owner": "AccelerateHS",
"repo": "accelerate",
"rev": "02da6161ef143a9886c8bce542cd96029c4f527a",
"lastModified": 1712605099,
"narHash": "sha256-jDyIDPiGWAw4qLRoYA4p6njANOg4/EOCx0jmFl607IM=",
"owner": "jcpetruzza",
"repo": "barbies",
"rev": "856bc3d3cc72a13e95ed495afd15683c45c7cc55",
"type": "github"
},
"original": {
"owner": "AccelerateHS",
"repo": "accelerate",
"owner": "jcpetruzza",
"repo": "barbies",
"type": "github"
}
},
"barbies-th": {
"flake": false,
"locked": {
"lastModified": 1730091166,
"narHash": "sha256-kc3ObxnhAJW6vgroFIax/qBOrp3HIWoLRtu0jsJUGIE=",
"owner": "fumieval",
"repo": "barbies-th",
"rev": "46c7b8c68634b219ff12e7966983f9b46a5976d4",
"type": "github"
},
"original": {
"owner": "fumieval",
"repo": "barbies-th",
"type": "github"
}
},
@ -21,11 +37,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1733312601,
"narHash": "sha256-4pDvzqnegAfRkPwO3wmwBhVi/Sye1mzps0zHWYnP88c=",
"lastModified": 1756770412,
"narHash": "sha256-+uWLQZccFHwqpGqr2Yt5VsW/PbeJVTn9Dk6SHWhNRPw=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "205b12d8b7cd4802fbcb8e8ef6a0f1408781a4f9",
"rev": "4524271976b625a4a605beefd893f270620fd751",
"type": "github"
},
"original": {
@ -51,11 +67,11 @@
},
"haskell-flake": {
"locked": {
"lastModified": 1734984991,
"narHash": "sha256-oUYtRBD3Yhw2jvKYo0lfd82fgEQQbFoiJcHO923gmOc=",
"lastModified": 1756607542,
"narHash": "sha256-+99fEAk0HwjYgIW2tEOs7ayBDxnU9NAM5E29ZxgyX40=",
"owner": "srid",
"repo": "haskell-flake",
"rev": "daf00052906bdd977e57a07f7048437214232e87",
"rev": "73e3891fb135c679a1c30fae4b101e5b41b8ca61",
"type": "github"
},
"original": {
@ -64,42 +80,6 @@
"type": "github"
}
},
"llvm-hs": {
"flake": false,
"locked": {
"lastModified": 1665495332,
"narHash": "sha256-JKrpUmHJ1nsNiCoHhV5FCcdQGlNFfD37Oiu5kSmghfM=",
"owner": "llvm-hs",
"repo": "llvm-hs",
"rev": "423220bffac4990d019fc088c46c5f25310d5a33",
"type": "github"
},
"original": {
"owner": "llvm-hs",
"ref": "llvm-12",
"repo": "llvm-hs",
"type": "github"
}
},
"llvm-hs-pure": {
"flake": false,
"locked": {
"dir": "llvm-hs-pure",
"lastModified": 1665495332,
"narHash": "sha256-JKrpUmHJ1nsNiCoHhV5FCcdQGlNFfD37Oiu5kSmghfM=",
"owner": "llvm-hs",
"repo": "llvm-hs",
"rev": "423220bffac4990d019fc088c46c5f25310d5a33",
"type": "github"
},
"original": {
"dir": "llvm-hs-pure",
"owner": "llvm-hs",
"ref": "llvm-12",
"repo": "llvm-hs",
"type": "github"
}
},
"mission-control": {
"locked": {
"lastModified": 1733438716,
@ -117,11 +97,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1734649271,
"narHash": "sha256-4EVBRhOjMDuGtMaofAIqzJbg4Ql7Ai0PSeuVZTHjyKQ=",
"lastModified": 1756542300,
"narHash": "sha256-tlOn88coG5fzdyqz6R93SQL5Gpq+m/DsWpekNFhqPQk=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "d70bd19e0a38ad4790d3913bf08fcbfc9eeca507",
"rev": "d7600c775f877cd87b4f5a831c28aa94137377aa",
"type": "github"
},
"original": {
@ -133,28 +113,31 @@
},
"nixpkgs-lib": {
"locked": {
"lastModified": 1733096140,
"narHash": "sha256-1qRH7uAUsyQI7R1Uwl4T+XvdNv778H0Nb5njNrqvylY=",
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz"
"lastModified": 1754788789,
"narHash": "sha256-x2rJ+Ovzq0sCMpgfgGaaqgBSwY+LST+WbZ6TytnT9Rk=",
"owner": "nix-community",
"repo": "nixpkgs.lib",
"rev": "a73b9c743612e4244d865a2fdee11865283c04e6",
"type": "github"
},
"original": {
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz"
"owner": "nix-community",
"repo": "nixpkgs.lib",
"type": "github"
}
},
"root": {
"inputs": {
"accelerate": "accelerate",
"barbies": "barbies",
"barbies-th": "barbies-th",
"flake-parts": "flake-parts",
"flake-root": "flake-root",
"haskell-flake": "haskell-flake",
"llvm-hs": "llvm-hs",
"llvm-hs-pure": "llvm-hs-pure",
"mission-control": "mission-control",
"nixpkgs": "nixpkgs",
"systems": "systems",
"treefmt-nix": "treefmt-nix"
"treefmt-nix": "treefmt-nix",
"winery": "winery"
}
},
"systems": {
@ -179,11 +162,11 @@
]
},
"locked": {
"lastModified": 1734982074,
"narHash": "sha256-N7M37KP7cHWoXicuE536GrVvU8nMDT/gpI1kja2hkdg=",
"lastModified": 1756662192,
"narHash": "sha256-F1oFfV51AE259I85av+MAia221XwMHCOtZCMcZLK2Jk=",
"owner": "numtide",
"repo": "treefmt-nix",
"rev": "e41e948cf097cbf96ba4dff47a30ea6891af9f33",
"rev": "1aabc6c05ccbcbf4a635fb7a90400e44282f61c4",
"type": "github"
},
"original": {
@ -191,6 +174,22 @@
"repo": "treefmt-nix",
"type": "github"
}
},
"winery": {
"flake": false,
"locked": {
"lastModified": 1732329065,
"narHash": "sha256-CDFOD4B6cJgAe3ebTVF21/eCJtbHMEzmKF3b/XxMOw4=",
"owner": "fumieval",
"repo": "winery",
"rev": "2d6976bc822f6d2c2d590703b9470cb9e62191a5",
"type": "github"
},
"original": {
"owner": "fumieval",
"repo": "winery",
"type": "github"
}
}
},
"root": "root",

flake.nix

@ -8,19 +8,24 @@
flake-root.url = "github:srid/flake-root";
treefmt-nix.url = "github:numtide/treefmt-nix";
treefmt-nix.inputs.nixpkgs.follows = "nixpkgs";
accelerate = {
url = "github:AccelerateHS/accelerate?submodules=1";
flake = false;
};
llvm-hs = {
url = "github:llvm-hs/llvm-hs?ref=llvm-12";
flake = false;
};
llvm-hs-pure = {
url = "github:llvm-hs/llvm-hs?dir=llvm-hs-pure&ref=llvm-12";
flake = false;
};
mission-control.url = "github:Platonic-Systems/mission-control";
winery = {
url = "github:fumieval/winery";
flake = false;
};
barbies-th = {
url = "github:fumieval/barbies-th";
flake = false;
};
barbies = {
url = "github:jcpetruzza/barbies";
flake = false;
};
};
outputs = inputs:
@ -38,7 +43,7 @@
# See https://github.com/srid/haskell-flake/blob/master/example/flake.nix
haskellProjects.default = {
# The base package set (this value is the default)
basePackages = pkgs.haskell.packages.ghc96;
# basePackages = pkgs.haskell.packages.ghc910;
# Packages to add on top of `basePackages`
packages = {
@ -46,30 +51,32 @@
# (Local packages are added automatically)
# https://github.com/lehins/hip.git
# hip.source = inputs.hip + "/hip";
accelerate.source = inputs.accelerate;
# llvm-hs.source = inputs.llvm-hs;
llvm-hs-pure.source = inputs.llvm-hs-pure;
winery.source = inputs.winery;
barbies-th.source = inputs.barbies-th;
barbies.source = inputs.barbies;
};
# Add your package overrides here
settings = {
uuid.jailbreak = true;
accelerate =
{
jailbreak = true;
broken = false;
};
llvm-hs =
{
jailbreak = true;
broken = false;
};
llvm-hs-pure =
{
jailbreak = true;
broken = false;
};
# accelerate =
# {
# jailbreak = true;
# broken = false;
# };
# winery.broken = false;
winery.jailbreak = true;
winery.check = false;
barbies-th.broken = false;
barbies-th.jailbreak = true;
barbies.broken = false;
barbies.jailbreak = true;
barbies.check = false;
tree-diff.check = false;
# hlint.jailbreak = true;
# ghc-lib-parser.jailbreak = true;
# ghc-lib-parser-ex.jailbreak = true;
# gmp-with-cxx.jailbreak = true;
};
# hlint = {
# jailbreak = true;
@ -96,7 +103,7 @@
programs.ormolu.enable = true;
programs.nixpkgs-fmt.enable = true;
programs.cabal-fmt.enable = true;
programs.hlint.enable = false;
# programs.hlint.enable = false;
# We use fourmolu
programs.ormolu.package = pkgs.haskellPackages.fourmolu;
@ -110,7 +117,6 @@
hoogle = {
description = "Start Hoogle server for project dependencies";
exec = ''
echo http://127.0.0.1:8888;
hoogle serve -p 8888 --local;
'';
category = "Dev Tools";
@ -139,6 +145,7 @@
config.treefmt.build.devShell
];
nativeBuildInputs = with pkgs; [
# llvmPackages_16.libllvm
];
};
};

97
src-exe/Main.hs Normal file

@ -0,0 +1,97 @@
module Main where
import Compress.Huffman
import Compress.Huffman qualified as HT
import Compress.PrefixTree qualified as PT
import Compress.Arithmetic qualified as A
import Data.Bifunctor qualified as Bi
import Data.Bit qualified as B
import Data.Bits (Bits ((.|.)))
import Data.Bits qualified as B
import Data.ByteString qualified as BS
import Data.HuffmanTree
import Data.Proxy qualified as P
import Codec.Winery qualified as C
import Data.Word
import GHC.Generics (Generic)
import Options.Generic qualified as O
import qualified Data.FiniteBit as FB
import qualified Relude.Unsafe as U
import qualified Data.ByteString.Lazy as BSL
-- import Data.Word4 (Word4(Word4))
import Compress.BurrowsWheeler
import qualified Control.Monad.Par as P
import qualified Compress.LengthDistancePairs as LDP
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
data CompressionStrategy = Huffman | MarkovHuffman deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
data CLIOpts = CLIOpts
{ task :: CompressOrDecompress,
strategy :: CompressionStrategy
}
deriving (Show, Generic, O.ParseRecord)
-- applyCompressionOptions ::
-- forall a.
-- (Integral a, B.Bits a, B.FiniteBits a, Ord a, C.Serialise a) =>
-- P.Proxy a ->
-- CLIOpts ->
-- BS.ByteString ->
-- BS.ByteString
-- applyCompressionOptions _ (CLIOpts Compress Huffman) f =
-- C.serialise . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) $ f
-- applyCompressionOptions _ (CLIOpts Compress MarkovHuffman) f =
-- C.encode . (PT.compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a a, a)) $ f
-- applyCompressionOptions _ (CLIOpts Decompress Huffman) f =
-- handleError $ Bi.second decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanTree a)) $ f
-- applyCompressionOptions _ (CLIOpts Decompress MarkovHuffman) f =
-- handleError $ Bi.second PT.decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanPrefixTree a a, a)) $ f
handleError (Right (Just bs)) = bs
handleError _ = []
main :: IO ()
main = do
-- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
f <- BS.getContents
let f' = BS.unpack f
print . BS.pack . A.decompress . A.compress $ f'
-- print =<< (P.runParIO $ do
-- let fCounts = (toCounts :: [Word8] -> [(Word8, Word64)]) . {- toBurrowsWheeler . -} (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
-- let fAsWords = map fst fCounts
-- -- let compressedUnencoded {- :: (A.Compressed Word8 (A.Word4MapSerialized Word8)) -} = (A.compress fAsWords)
-- -- print . A.talkAboutCompressed $ compressedUnencoded
-- -- let compressed = C.serialise (compressedUnencoded, A.compress . map snd $ fCounts)
-- -- huffmanCompressionRatio' <- P.spawnP $ (compressionRatioFor (Proxy :: Proxy Word8) (P.Proxy :: Proxy Word8) f)
-- -- arithmaticCodingCompressionRatio' <- P.spawnP $ ((fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
-- -- huffmanCompressionRatio <- P.get huffmanCompressionRatio'
-- -- arithmaticCodingCompressionRatio <- P.get arithmaticCodingCompressionRatio'
-- -- let lengthDistancePairsCompressedSize = fromIntegral . BS.length . C.serialise . (LDP.encode :: ByteString -> [LDP.LengthDistancePair Word16]) $ f
-- -- let lengthDistancePairsCompressionRatio :: Double = lengthDistancePairsCompressedSize / (fromIntegral $ BS.length f)
-- pure ()) -- (lengthDistancePairsCompressionRatio))
-- let decompressed = (A.decompress compressedUnencoded)
-- print ("huffman coding", huffmanCompressionRatio)
-- print ("compression ratio (arithmetic coding)", arithmaticCodingCompressionRatio)
-- print ("works?", decompressed == fAsWords)
-- print . take 10 . drop 70 . zip fAsWords $ decompressed
-- print . ("original length", ) . length $ fAsWords
-- let f = "hello tehre"
-- f <- BS.readFile "pg64317.txt"
-- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f
-- print $ BS.length . C.encode $ compressed
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word8)) $ f
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word16)) $ f
-- BS.writeFile "outin.txt" decompressed
-- print (decompressed, f)
-- print $ BS.length decompressed
-- print $ BS.length f
-- print (decompressed == f)
-- BS.putStr . applyCompressionOptions (P.Proxy :: P.Proxy Word16) cliOpts $ f
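
Note: the only live code in main above is the arithmetic-coding round trip on stdin; the equality check the commented-out blocks were printing can be expressed as a small helper (hypothetical name, using the same imports as this file):

-- roundTrips reports whether decompress . compress is the identity on the input bytes
roundTrips :: BS.ByteString -> Bool
roundTrips f = A.decompress (A.compress (BS.unpack f)) == BS.unpack f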

src/Compress/Arithmetic.hs

@ -1,5 +1,6 @@
-- | https://en.wikipedia.org/wiki/Arithmetic_coding
module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized) where
-- module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized, Word4Map) where
module Compress.Arithmetic where
import Data.Bits qualified as B
import Data.Word
@ -16,12 +17,23 @@ import Data.Map.Strict qualified as M
import Data.Maybe qualified as My
import Data.Ord
import Data.Ratio
import Data.Serialize qualified as C
import Codec.Winery qualified as C
import GHC.Generics (Generic)
import GHC.Natural (Natural)
import qualified Relude.Unsafe as US
import Data.Word4 (Word4, CompressWord4List, toCompressedWord4List, fromCompressedWord4List)
import qualified Data.ByteString as BS
import qualified Relude.Unsafe as U
import Codec.Winery (WineryRecord)
import Compress.BurrowsWheeler (toBurrowsWheeler)
import qualified Data.Bit as BV
import qualified Data.Vector as V
import qualified Data.Bit as VB
import qualified Data.Vector.Unboxed as VU
import Data.Dirs
-- import qualified Debug.Trace as D
import qualified Data.Proxy as P
import Compress.WordMarkovStats as WM
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
@ -29,41 +41,52 @@ import qualified Data.ByteString as BS
-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums
data WordMarkovStats = WordMarkovStats
{ location :: Ratio Integer,
size :: Ratio Integer
}
deriving (Show, Eq, Ord, Generic, C.Serialize)
loc' (WordMarkovStats {..}) = location
toRing :: forall a k. (Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
toRing :: forall a k. (Show a, Show k, Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
where
sizes = map ((% maxBound') . fromIntegral . snd) xs
wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
asFracsOfTotalSum = map fst . US.tail . L.scanl' f (0, 0) $ sizes
maxBound' :: Integer
maxBound' = fromIntegral (maxBound :: a)
resize xs = map ( / sum' ) xs
where
sum' = sum xs
withBumpedZeroSized :: [Rational]
withBumpedZeroSized
| numZeroSized /= 0 = map (max (remainingSpace / numZeroSized)) asFracsOfTotalSum
| otherwise = asFracsOfTotalSum
= resize
. map (max minimumSize)
. resize
. map (fromIntegral . snd)
$ xs
where
-- | we have to round the 0s up to something
-- | we somewhat arbitrarily choose half of the smallest
-- | normally representable amount
minimumSize :: Rational
minimumSize = 1 % (maxBound' * 2)
remainingSpace = 1 - sum asFracsOfTotalSum
numZeroSized = L.genericLength . filter (== 0) $ asFracsOfTotalSum
f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
where
newFrac = currFrac * (1 - runningSum)
twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k [(k, b)]
relativeCounts :: forall k . Ord k => [(k, Integer)] -> [(k, Rational)]
relativeCounts m = map (Bi.second asFractionOfSum) m
where
asFractionOfSum = ( % sum') . fromIntegral
sum' = fromIntegral . F.sum . map snd $ m
maxBound' :: forall a b . (Num b, Bounded a, Integral a) => P.Proxy a -> b
maxBound' p = fromIntegral (maxBound :: a)
twoByteMarkov :: forall k . (Ord k) => [k] -> M.Map k [(k, Rational)]
twoByteMarkov xs =
M.map sizeAsFraction
. M.fromListWith (M.unionWith (+))
@ -72,77 +95,106 @@ twoByteMarkov xs =
. US.tail
$ xs
where
-- optimization, changes fractions so they represent the fraction
-- of remaining space in the list taken up instead of total space
sizeAsFraction :: M.Map k Integer -> [(k, b)]
sizeAsFraction m =
zip keys
. map discretizeFraction
. fractionOfRemainingSums
$ counts
where
fractionOfRemainingSums xs = zipWith (%) xs . scanr (+) 0 $ xs
asList = L.sortOn (negate . snd) . M.assocs $ m
sizeAsFraction :: M.Map k Integer -> [(k, Rational)]
sizeAsFraction = relativeCounts . M.assocs
keys = map fst asList
data Chunk a = Chunk
{ location :: SerializedDirs,
start :: a,
length :: Word64
}
deriving (Eq, Ord, Show, Generic, P.NFData)
deriving C.Serialise via C.WineryVariant (Chunk a)
counts = map snd asList
-- maxBound = maxBound' (P.Proxy :: Proxy b)
discretizeFraction :: Rational -> b
discretizeFraction = floor . fromRational . (* maxBound')
discretizeFraction :: forall b . (Bounded b, Integral b) => Rational -> b
discretizeFraction = floor . min maxBound'' . fromRational . (* (succ $ maxBound' (P.Proxy :: P.Proxy b)))
where
maxBound'' = maxBound' (P.Proxy :: P.Proxy b)
maxBound' :: Ratio Integer
maxBound' = fromIntegral (maxBound :: b)
type Word4MapSerialized a = M.Map a ([a], CompressWord4List)
type Word4Map a = M.Map a [(a, Word4)]
type Word4Map a = M.Map a [(a, Word8)]
-- talkAboutCompressed (Compressed {..}) = (
-- ("markovs (serealized)", BS.length . C.serialiseOnly $ markovs),
-- ("markovs (unserealized)", BS.length . C.serialiseOnly . unserealizeWord4Map $ markovs),
-- ("chunks", BS.length . C.serialiseOnly $ chunks))
newtype Prediction = Prediction {
chances :: [(Word8, Rational)]
}
class CompressionModel a where
createModel :: [Word8] -> a
-- done in reverse
predict :: a -> [Word8] -> Prediction
-- at some point write
-- instance IsoSerialisable a b => C.Serialise b
class C.Serialise b => IsoSerialisable a b where
toSerialisable :: a -> b
fromSerialisable :: b -> a
-- newtype BetterCompressed = BetterCompressed (Compressed Word8 (A.Word4MapSerialized Word8))
-- deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
-- deriving C.Serialise via C.WineryRecord BetterCompressed
data Compressed a mapImplementation = Compressed
{ markovs :: mapImplementation,
chunks :: [Chunk a]
}
deriving (Eq, Ord, Show, Generic) -- , C.Serialize, P.NFData)
deriving (Eq, Ord, Show, Generic, NFData) -- , C.Serialise, P.NFData)
deriving C.Serialise via C.WineryVariant (Compressed a mapImplementation)
deriving instance (Ord a, C.Serialize a) => C.Serialize (Compressed a (Word4MapSerialized a))
data Chunk a = Chunk
{ location :: Ratio Integer,
start :: a,
length :: Int
}
deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
-- unserealizeWord4Map :: forall a . Show a => Word4MapSerialized a -> Word4Map a
-- unserealizeWord4Map = M.map unserializedIndividualMap
-- where
-- unserializedIndividualMap :: ([a], CompressWord4List) -> [(a, Word4)]
-- unserializedIndividualMap (bytes, sizes) = zip bytes . fromCompressedWord4List $ sizes
unserealizeWord4Map :: Word4MapSerialized a -> Word4Map a
unserealizeWord4Map = M.map (uncurry zip . Bi.second ((++ [maxBound]) . fromCompressedWord4List))
-- -- unserealizeWord4Map = M.map (uncurry zip . map (\(bytes, sizes) -> (bytes,) . (++ [maxBound]) . fromCompressedWord4List (Prelude.length bytes) $ sizes))
serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
serealizeWord4Map = M.map (Bi.second (toCompressedWord4List . L.init) . L.unzip)
-- serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
-- serealizeWord4Map = M.map (Bi.second toCompressedWord4List . L.unzip)
decompress ::
forall a.
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
Compressed a (Word4MapSerialized a) ->
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
Compressed a (Word4Map a) ->
[a]
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ chunks
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk
$ chunks
where
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs'
markovs' = unserealizeWord4Map markovs
markovs' = -- unserealizeWord4Map
markovs
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
decompressChunk :: Chunk a -> [a]
decompressChunk (Chunk {..}) = map snd . take length . L.iterate' decompress' $ (location, start)
decompressChunk (Chunk {..}) = map snd . take (fromIntegral length) . L.iterate' decompress' $ (location', start) -- (fromBS rawLocation, start)
where
location' = deserialize location
-- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
decompress' (!loc, !prev) = ((loc - ansLoc) / newSize, newVal)
decompress' (!loc, !prevWord) = -- traceShow (newSize) $
((loc - ansLoc) / newSize, newVal)
where
ansLoc :: Ratio Integer
newVal :: a
newSize :: Ratio Integer
(!ansLoc, (!newVal, !newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)
(ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prevWord)
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
sanityCheck = map (sum . M.elems) . M.elems
@ -156,18 +208,31 @@ chunk chunkSize = chunk'
(xs', xs'') = splitAt chunkSize xs
chunkLength :: Int
chunkLength = 8000
chunkLength = 4096
-- runPar = id
-- parMap = map
-- compress ::
-- forall a.
-- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
-- [a] ->
-- Compressed a (Word4MapSerialized a)
compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))) $ Compressed (serealizeWord4Map twoByteMarkovs) . P.runPar . P.parMap compressChunk $ unCompressedChunks
compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a (Word4Map a)
compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap
(compressChunk)
$ unCompressedChunks
where
toCompress = toCompress' -- toBurrowsWheeler toCompress'
-- toTraceShow' = (unserealizeWord4Map . serealizeWord4Map $ twoByteMarkovs) == twoByteMarkovs
-- toTraceShow = (fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))
unCompressedChunks = chunk chunkLength toCompress
twoByteMarkovs = twoByteMarkov toCompress
twoByteMarkovs = M.map (map (Bi.second discretizeFraction)) . twoByteMarkov $ toCompress
rings = M.map toRing twoByteMarkovs
@ -175,12 +240,23 @@ compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.
Chunk
{ location = shortestLocation endStats,
start = US.head toCompress,
length = L.length toCompress
length = fromIntegral . L.length $ toCompress
}
where
pairs = zip toCompress . US.tail $ toCompress
shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size)
shortestLocation (WordMarkovStats {..}) = serialize $ binarySearch location (location + size)
-- shortestLocation (WordMarkovStats {..})
-- -- | simplestBetweenAnswer /= upperBound = simplestBetweenAnswer
-- | otherwise = simplestBetween location $ upperBound - epsilon
-- where
-- simplestBetweenAnswer = simplestBetween location upperBound
-- upperBound = location + size
-- -- | almost entirely arbitrary, picked because it's guaranteed to be smaller than size
-- -- | and should make for a fairly quick subtraction
-- epsilon = 1 % (denominator size)
endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
@ -194,6 +270,7 @@ compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.
size = prevSize * nextSize
}
pyramidFold :: (a -> a -> a) -> [a] -> a
pyramidFold f = pyramid
where
@ -209,6 +286,7 @@ simplestBetween :: Rational -> Rational -> Rational
simplestBetween x y
| x == y = x
| x > 0 = simplestBetween' n d n' d'
-- | x > 0 = simplestBetween' n d n' d'
| otherwise = 0 % 1
where
n = numerator x
@ -226,3 +304,15 @@ simplestBetween x y
nd'' = simplestBetween' d' r' d r
n'' = numerator nd''
d'' = denominator nd''
binarySearch x y = binarySearch' 0 1
where
acceptLow = min x y
acceptHigh = max x y
binarySearch' searchSpaceLow searchSpaceHigh
| mid < acceptLow = Higher : binarySearch' mid searchSpaceHigh
| mid > acceptHigh = Lower : binarySearch' searchSpaceLow mid
| otherwise = []
where
mid = (searchSpaceLow + searchSpaceHigh) / 2
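
Note: binarySearch emits one Higher/Lower step per halving until the midpoint of the shrinking search interval lands inside [min x y, max x y]; Data.Dirs.deserialize (used in decompressChunk above) rebuilds exactly that midpoint. A worked example, following the definitions above:

-- binarySearch (3 % 10) (4 % 10):
--   interval [0, 1]:     mid 1/2 > 2/5   -> Lower
--   interval [0, 1/2]:   mid 1/4 < 3/10  -> Higher
--   interval [1/4, 1/2]: mid 3/8 lies in [3/10, 2/5] -> done
-- result: [Lower, Higher], and deserialize (serialize [Lower, Higher]) = 3 % 8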

src/Compress/BurrowsWheeler.hs

@ -0,0 +1,27 @@
-- |
module Compress.BurrowsWheeler where
-- import qualified Data.CircularList as CL
import qualified Relude.Unsafe as U
-- import Compress.BurrowsWheeler (toBurrowsWheeler)
toBurrowsWheeler (x:xs) = map U.head . sortOn (U.tail) . take (pred . length $ xs) . tails $ xs'
where
xs' = toList $ (x :| xs) <> (x :| [])
toCounts :: forall a b . (Eq a, Eq b, Num b, Enum b, Bounded a, Bounded b) => [a] -> [(a, b)]
toCounts = reverse . foldl' f []
where
f [] x = [(x, 0)]
f ((x, count) : xs) newVal
| x == newVal && count == maxBound = (x, 0) : (x, count) : xs
| x == newVal = (x, succ count) : xs
| otherwise = (newVal, 0) : (x, count) : xs
-- toBurrowsWheeler xs = map last . sort . map (findLyndonWord) . take (length xs) . tails $ xs'
-- where
-- xs' = xs ++ xs
-- findLyndonWord (x : xs) = (x :|) . map snd . takeWhile (uncurry (<=)) . zip (x : xs) $ xs
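
Note: toCounts is the run-length step from the commit message; the stored count is the number of extra repetitions after the first element of a run, and a run longer than the count type can represent is split into several pairs. A worked example (element and count types chosen for illustration):

-- toCounts ([7,7,7,9,9,7] :: [Word8]) == ([(7,2),(9,1),(7,0)] :: [(Word8, Word8)])
-- i.e. "7 three times, 9 twice, 7 once"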

src/Compress/Huffman.hs

@ -20,7 +20,7 @@ import Data.Bits as B
import Data.Maybe (fromMaybe)
import Data.Maybe qualified as My
import Data.PQueue qualified as PQ
import Data.Serialize qualified as C
import Codec.Winery qualified as C
import Data.Text qualified as T
import Data.Text.IO qualified as TIO
import Data.Vector.Unboxed qualified as V
@ -30,6 +30,8 @@ import GHC.Generics (Generic)
import Options.Generic qualified as O
import System.Environment qualified as SE
import Data.HuffmanTree
import Data.Word4 (CompressWord4List(xs))
import Compress.BurrowsWheeler (toBurrowsWheeler, toCounts)
decompress
:: forall a
@ -50,13 +52,27 @@ decompress (TreeDirs treeDirs, tree) = BS.concat . map toByteString <$> decompre
nextLeaf (L : xs) (Node {..}) = nextLeaf xs left
nextLeaf (R : xs) (Node {..}) = nextLeaf xs right
data BurrowsWheelerCompressed a b = BurrowsWheelerCompressed {
elements :: Compressed SerializedTreeDirs a,
counts :: Compressed SerializedTreeDirs b
}
deriving (Eq, Ord, Generic) -- , C.Serialise, P.NFData)
deriving C.Serialise via C.WineryRecord (BurrowsWheelerCompressed a b)
data Compressed treeDir a = Compressed {
treeDirs :: treeDir,
huffman :: HuffmanTree a
}
deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
deriving C.Serialise via C.WineryRecord (Compressed treeDir a)
compress
:: forall a
. (Ord a, Integral a, B.FiniteBits a, B.Bits a)
=> BS.ByteString
-> Maybe (TreeDirs, HuffmanTree a)
compress bs =
liftA2 (,) (TreeDirs <$> treeDirections) mergedHuffmanTrees
. (Ord a, Integral a)
=> [a]
-> Maybe (Compressed SerializedTreeDirs a)
compress dividedByteString =
liftA2 Compressed (serializeTreeDirs . TreeDirs <$> treeDirections) mergedHuffmanTrees
where
treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString
@ -64,13 +80,19 @@ compress bs =
mergeHuffmanTrees
. PQ.fromList
. map (uncurry (flip (,)) . Bi.first Leaf)
. counts
. countOccurances
$ dividedByteString
treeDirMap :: M.Map a [TreeDir]
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
dividedByteString = toWordsList bs
compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b)
compressWithBurrowsWheeler bs
= liftA2 BurrowsWheelerCompressed
(compress . map fst $ withCounts)
(compress . map snd $ withCounts)
where
withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs
-- testCompression
-- :: forall a
@ -100,10 +122,13 @@ compress bs =
-- decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBitsOps a, C.Serialize a) => BS.ByteString -> Either String (Maybe (TreeDirs, HuffmanTree a))
-- decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode
counts :: (Ord a) => [a] -> [(a, Int)]
counts = M.toList . F.foldl' combiningInsert M.empty
where
combiningInsert m key = M.insertWith (+) key 1 m
-- I've replaced this with a simpler version but it might be slower
countOccurances :: (Ord a) => [a] -> [(a, Int)]
countOccurances = M.toList . M.fromListWith (+) . map (, 1)
-- countOccurances = M.toList . F.foldl' combiningInsert M.empty
-- where
-- combiningInsert m key = M.insertWith (+) key 1 m
divideByteString :: Int -> BS.ByteString -> [BS.ByteString]
divideByteString n [] = []
@ -113,15 +138,17 @@ divideByteString n bs = x : divideByteString n xs
compressionRatioFor
:: forall a
. (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialize a)
:: forall a b
. (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a,
Integral b, Ord b, C.Serialise b, Bounded b)
=> D.Proxy a
-> D.Proxy b
-> BS.ByteString
-> Double
compressionRatioFor proxy bs =
compressionRatioFor _ _ bs =
(/ (fromIntegral . BS.length $ bs))
. fromIntegral
. BS.length
. C.encode
. (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a))
. C.serialise
. (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b))
$ bs
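
Note: the new countOccurances above is the usual fromListWith histogram, returned in ascending key order; a quick worked example:

-- countOccurances ([3,1,3,3,2] :: [Word8]) == [(1,1),(2,1),(3,3)]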

src/Compress/LengthDistancePairs.hs

@ -0,0 +1,97 @@
module Compress.LengthDistancePairs where
import qualified Data.ByteString as BS
import Data.Word
import qualified Data.Foldable as F
import qualified Relude.Unsafe as US
import qualified Debug.Trace as D
import Compress.Arithmetic (maxBound')
import qualified Data.Proxy as P
import qualified Codec.Winery as C
import qualified Relude.Unsafe as U
data LengthDistancePair a = Literal Word8 | Repeat {
distance :: a,
matchLength :: Word8
}
deriving (Eq, Ord, Show, Generic, NFData)
deriving C.Serialise via (C.WineryVariant (LengthDistancePair a))
ldpLength :: LengthDistancePair a -> Int
ldpLength (Literal _) = 1
ldpLength (Repeat {..}) = fromIntegral matchLength
-- | normally, we'd order `previous` in reverse, but finding
-- | the longest-matching-subsequence starting at `next`
-- | doesn't let us.
-- | The linked list doesn't really lend itself to the problem
data Cursor = Cursor {
previous :: [Word8],
next :: [Word8]
} deriving (Eq, Ord, Show, Generic)
toCursor :: BS.ByteString -> Cursor
toCursor bs = Cursor {
previous = [],
next = BS.unpack bs
}
word8Max :: Int
word8Max = fromIntegral (maxBound :: Word8)
cutTo :: Int -> [a] -> [a]
cutTo newLength xs = drop (max 0 . flip (-) newLength . length $ xs) xs
decode :: forall a . Integral a => [LengthDistancePair a] -> [Word8]
decode = foldl' decodeIndividual []
where
decodeIndividual xs (Literal x) = xs ++ [x]
decodeIndividual xs (Repeat {..})
= (++) xs
. take (fromIntegral matchLength)
. drop (F.length xs - fromIntegral distance)
$ xs
encode :: forall a . (Integral a, Bounded a) => [Word8] -> [LengthDistancePair a]
encode xs = encode' xs xs 0
where
encode' :: [Word8] -> [Word8] -> Int -> [LengthDistancePair a]
encode' predictFrom [] n = []
encode' predictFrom next 0 = []
encode' predictFrom next distance = match : encode' (drop (maxLength - newDistance) predictFrom) (drop matchLength next) newDistance
where
(match, matchLength)
= F.maximumBy (comparing snd)
. map (\x -> (x, ldpLength x))
. ( Literal (U.head next) : )
. zipWith Repeat (reverse [1 .. (fromIntegral distance)])
. map lengthMatchWithRepeat . tails
. take distance
$ predictFrom
maxLength = fromIntegral (minBound :: a)
newDistance = max maxLength . (+ matchLength) $ distance
lengthMatchWithRepeat cyclexs' = zipWithCycle' cyclexs' next
where
zipWithCycle' [] ys = zipWithCycle' cyclexs' ys
zipWithCycle' (x:xs) (y:ys) | x == y = 1 + zipWithCycle' xs ys
zipWithCycle' _ _ = 0
-- lengthMatchWithRepeat candidate
-- = genericLength
-- . takeWhile (uncurry (==)) . zip (cycle candidate)
-- $ next
-- fasterEncode bs = encode' start start 0
-- where
-- encode' toPredict predictFrom maxDistance = (take diff previous)
-- start = BS.unpack bs take
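
Note: these are LZ77-style length/distance pairs; decode replays each Repeat against the output decoded so far, copying matchLength bytes starting distance bytes back. A small worked example, with the distance stored as Word32:

-- decode ([ Literal 1, Literal 2, Repeat { distance = 2, matchLength = 2 } ]
--           :: [LengthDistancePair Word32])
--   == [1, 2, 1, 2]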

src/Compress/PrefixTree.hs

@ -17,9 +17,9 @@ import Data.PQueue qualified as PQ
import Debug.Trace qualified as D
import Debug.Trace qualified as T
import GHC.Generics
import qualified Data.Serialize as C
import Data.FiniteBit
import qualified Data.FiniteBit as B
import qualified Codec.Winery as C
data Tree a = (Ord a) =>
Tree
@ -28,7 +28,8 @@ data Tree a = (Ord a) =>
newtype HuffmanPrefixTree a b = HuffmanPrefixTree
{ inner :: M.Map a (HuffmanTree b)
} deriving (Eq, Ord, Show, Generic, C.Serialize)
} deriving (Eq, Ord, Show, Generic)
deriving C.Serialise via C.WineryRecord (HuffmanPrefixTree a b)
finiteBitTupleUncons ::
forall a b.
@ -85,7 +86,7 @@ compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> in
tree = toHuffmanTree . nGramCounts $ bs
treeDirMap :: M.Map a (M.Map b [TreeDir])
treeDirMap = M.map HT.findTreeDirections . Compress.PrefixTree.inner $ tree
treeDirMap = M.map HT.findTreeDirections tree.inner
initial :: Maybe a
initial = fst <$> finiteBitUncons bs
@ -96,37 +97,6 @@ compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> in
treeDirsFor :: (a, b) -> [TreeDir]
treeDirsFor (a, b) = (treeDirMap M.! a) M.! b
-- | all (M.null . children) . M.elems . children $ tree =
-- fmap End
-- . HT.fromList
-- . map (\x -> (prefixCounts x, x))
-- . M.keys
-- . children
-- $ tree
-- | otherwise =
-- Just
-- . Layer
-- . M.mapMaybeWithKey (\key val -> toHuffmanTree' (key : soFar) val)
-- . children
-- $ tree
-- where
-- prefixCounts :: a -> Int
-- prefixCounts x =
-- fromIntegral
-- . sum
-- . M.elems
-- . M.filterWithKey (\key val -> L.isPrefixOf (reverse . (x :) $ soFar) key)
-- $ nGrams
-- toHuffmanTree :: Tree a -> p1 -> HuffmanTree a
-- toHuffmanTree :: forall a . Tree a -> M.Map [a] Word -> HuffmanTree [a]
-- toHuffmanTree (Tree {..}) nGrams soFar | M.size children == 1 = Leaf . map (reverse . (: soFar)) . M.keys $ children
-- toHuffmanTree (Tree {..}) nGrams soFar = Leaf . map (reverse . (: soFar)) . M.keys $ children
-- where
-- sorted = L.sortBy (prefixCounts . fst) . M.toList $ children
nGramCounts ::
forall a b.
(Integral a, B.FiniteBits a, B.Bits a, Integral b, B.FiniteBits b, B.Bits b) =>
@ -155,34 +125,5 @@ fromSingleList (x : xs) = Tree . M.singleton x . fromSingleList $ xs
fromList :: (Ord a) => [[a]] -> Tree a
fromList = F.foldl' merge Compress.PrefixTree.empty . map fromSingleList
-- insert :: Ord a => Tree a -> [a] -> Tree a
-- insert (Tree {..}) (x:xs) =
merge :: Tree a -> Tree a -> Tree a
merge (Tree children0) (Tree children1) = Tree $ M.unionWith merge children0 children1
-- deriving instance Eq (Tree a)
-- deriving instance Ord (Tree a)
-- deriving instance (Show a) => Show (Tree a)
-- empty :: (Ord a) => Tree a
-- empty = Tree M.empty
-- fromList :: (Ord a, F.Foldable t) => t [a] -> Tree a
-- fromList = F.foldl' insert empty
-- insert :: Tree a -> [a] -> Tree a
-- insert (Tree {..}) [] = Tree M.empty
-- insert (Tree {..}) (x : xs) =
-- Tree
-- . flip (M.insert x) children
-- . flip insert xs
-- . My.fromMaybe empty
-- . M.lookup x
-- $ children
-- lookup :: (Ord a) => [a] -> Tree a -> Bool
-- lookup [] = const True
-- lookup (x : xs) = maybe False (Compress.PrefixTree.lookup xs) . M.lookup x . children
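
Note: fromList builds the trie by folding merge over one singleton chain per input list, so shared prefixes collapse into a single branch. A sketch of the resulting shape, in informal notation and assuming fromSingleList [] = Compress.PrefixTree.empty (that equation is outside this hunk):

-- fromList [[1,2],[1,3]]
--   == Tree { 1 -> Tree { 2 -> empty, 3 -> empty } }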

src/Compress/WordMarkovStats.hs

@ -0,0 +1,8 @@
module Compress.WordMarkovStats where
data WordMarkovStats = WordMarkovStats
{ location :: Ratio Integer,
size :: Ratio Integer
}
deriving (Show, Eq, Ord, Generic)

src/Data/ArbitraryPrecisionFloatingPoint.hs

@ -0,0 +1,62 @@
module Data.ArbitraryPrecisionFloatingPoint where
import Compress.Arithmetic (binarySearch)
import Data.Function.Pointless
data FloatingPoint = FloatingPoint
{ exponent :: Integer,
significand :: Integer
}
deriving (Eq)
instance Num FloatingPoint where
(+) :: FloatingPoint -> FloatingPoint -> FloatingPoint
(+) = onEqualExponent (+)
(*) :: FloatingPoint -> FloatingPoint -> FloatingPoint
(*) n0 n1 = correctExponent $ FloatingPoint (n0.exponent + n1.exponent) (n0.significand * n1.significand)
abs :: FloatingPoint -> FloatingPoint
abs (FloatingPoint {..}) = FloatingPoint exponent (abs significand)
negate :: FloatingPoint -> FloatingPoint
negate (FloatingPoint {..}) = FloatingPoint exponent (negate significand)
fromInteger :: Integer -> FloatingPoint
fromInteger = correctExponent . FloatingPoint 0
signum :: FloatingPoint -> FloatingPoint
signum = FloatingPoint 0 . signum . (.significand)
onEqualExponent :: (Integer -> Integer -> Integer) -> FloatingPoint -> FloatingPoint -> FloatingPoint
onEqualExponent f n0 n1 = correctExponent . FloatingPoint n0'.exponent $ on f (.significand) n0' n1'
where
(n0', n1') = equalizeExponents n0 n1
-- | the significand shouldn't have trailing 0s in binary;
-- | the exponent should be incremented instead.
-- | this function keeps halving until the significand isn't divisible by 2
correctExponent :: FloatingPoint -> FloatingPoint
correctExponent (FloatingPoint {..})
| significand /= 0 && remainder == 0 = correctExponent $ FloatingPoint (succ exponent) quotient
| otherwise = (FloatingPoint {..})
where
(quotient, remainder) = divMod significand 2
equalizeExponents :: FloatingPoint -> FloatingPoint -> (FloatingPoint, FloatingPoint)
equalizeExponents n0 n1 =
( withDiff n0 exponent0Diff,
withDiff n1 exponent1Diff
)
where
withDiff n exponentDiff = FloatingPoint (n.exponent + exponentDiff) (n.significand * (2 ^ exponentDiff))
signedExponentDiff = n0.exponent - n1.exponent
(exponent0Diff, exponent1Diff)
| signedExponentDiff < 0 = (0, abs signedExponentDiff)
| otherwise = (signedExponentDiff, 0)
instance Ord FloatingPoint where
compare :: FloatingPoint -> FloatingPoint -> Ordering
compare = uncurry (on compare (.significand)) .: equalizeExponents

52
src/Data/Dirs.hs Normal file

@ -0,0 +1,52 @@
-- |
module Data.Dirs where
import qualified Data.ByteString as BS
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector as V
import qualified Data.Bit as VB
import qualified Data.Bit as BV
import Data.Ratio
import Control.Monad.Par qualified as P
import Codec.Winery qualified as C
import qualified Relude as R
data Dir = Higher | Lower deriving Show
data SerializedDirs = SerializedDirs {
bs :: BS.ByteString,
length :: Int
} deriving (Generic, Show, Ord, Eq, P.NFData, C.Serialise)
-- serialize :: [Dir] -> SerializedDirs
serialize dirs = SerializedDirs {..}
where
bs = VB.cloneToByteString . VU.fromList . map dirToBit $ dirs
length = R.length dirs
deserialize :: SerializedDirs -> Rational
deserialize (SerializedDirs {..}) = fromDirs . map bitToDir . VU.toList $ asVec
where
asVec = VU.take length . VB.cloneFromByteString $ bs
addNewBit x Higher = (x * 2) + 1
addNewBit x Lower = x * 2
fromDirs = sum . (0.5 :) . zipWith (*) fractions . map dirToNum
where
addNewBit x Higher = (x * 2) + 1
addNewBit x Lower = x * 2
dirToNum Higher = 1
dirToNum Lower = -1
bitToDir (BV.Bit True ) = Higher
bitToDir (BV.Bit False) = Lower
dirToBit Higher = BV.Bit True
dirToBit Lower = BV.Bit False
fractions :: [Rational]
fractions = iterate ( / 2) (1 % 4)
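
Note: deserialize reads the direction list as a binary-search transcript: start at 1/2 and add or subtract successively halved offsets (1/4, 1/8, ...), per fractions and dirToNum above. Worked values:

-- fromDirs [Higher, Lower] = 1/2 + 1/4 - 1/8 = 5 % 8
-- fromDirs [Lower, Higher] = 1/2 - 1/4 + 1/8 = 3 % 8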

src/Data/HuffmanTree.hs

@ -9,6 +9,9 @@ import Data.Vector.Unboxed qualified as V
import Data.Bit qualified as B
import Data.Bit qualified as BV
import Data.ByteString qualified as BS
import qualified Codec.Winery as C
import Data.FiniteBit (toByteString)
import Codec.Winery (serialise)
data HuffmanTree a
= Leaf a
@ -16,24 +19,29 @@ data HuffmanTree a
{ left :: HuffmanTree a,
right :: HuffmanTree a
}
deriving (Eq, Ord, Show, Generic, C.Serialize, Functor)
deriving (Eq, Ord, Show, Generic, Functor)
deriving C.Serialise via (C.WineryVariant (HuffmanTree a))
-- here so we can define our own Serialize instance
newtype TreeDirs = TreeDirs {
inner :: [TreeDir]
} deriving (Eq, Ord, Show)
-- deriving C.Serialize via C.WineryVariant TreeDirs
data TreeDir = L | R deriving (Eq, Ord, Show)
instance C.Serialize TreeDirs where
put :: C.Putter TreeDirs
put = C.put . Bi.first BV.cloneToByteString . (\x -> (x, V.length x)) . V.fromList . map (BV.Bit . (== R)) . (inner :: TreeDirs -> [TreeDir])
newtype SerializedTreeDirs = SerializedTreeDirs
{
inner :: BS.ByteString
} deriving (Generic, Eq, Ord)
deriving C.Serialise via (C.WineryRecord SerializedTreeDirs)
get :: C.Get TreeDirs
get = do
(bs, len) <- C.get
pure . TreeDirs . map (\x -> if BV.unBit x then R else L) . V.toList . V.take len . BV.cloneFromByteString $ bs
serializeTreeDirs (TreeDirs inner) = SerializedTreeDirs . fst . Bi.first BV.cloneToByteString . (\x -> (x, V.length x)) . V.fromList . map (BV.Bit . (== R)) $ inner
unserializeTreeDirs (SerializedTreeDirs bs) = TreeDirs . map (\x -> if BV.unBit x then R else L) . V.toList . V.take len . BV.cloneFromByteString $ bs
where
len = BS.length bs
lookup ::
forall a.

src/Data/PQueue.hs

@ -32,10 +32,10 @@ maxView :: PQueue a -> Maybe (Int, (a, PQueue a))
maxView = abstractView IM.lookupMax
insert :: Int -> p -> PQueue p -> PQueue p
insert key a = PQueue . IM.insertWith NE.append key (NE.singleton a) . toMap
insert key a = PQueue . IM.insertWith NE.append key (NE.singleton a) . (.toMap)
elems :: PQueue a -> [a]
elems = concatMap NE.toList . IM.elems . toMap
elems = concatMap NE.toList . IM.elems . (.toMap)
keys :: PQueue a -> [IM.Key]
keys = IM.keys . toMap
keys = IM.keys . (.toMap)
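
Note: the toMap -> (.toMap) changes follow from the NoFieldSelectors and OverloadedRecordDot defaults added to the cabal file above: the field no longer generates a top-level selector function, so record-dot sections are used instead. A minimal sketch of the pattern (Wrapped/unwrap are hypothetical names, not part of this module):

-- with NoFieldSelectors + OverloadedRecordDot enabled:
newtype Wrapped = Wrapped { toMap :: Int }

unwrap :: Wrapped -> Int
unwrap = (.toMap)   -- record-dot section; a bare `toMap` is no longer in scope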

src/Data/Word4.hs

@ -1,46 +1,62 @@
-- | note to myself. Something's wrong here.
-- | it doesn't work properly when fed into Compress.Arithmetic
module Data.Word4 where
import Data.Bits
import Data.Word
import qualified Data.Serialize as C
import qualified Codec.Winery as C
import qualified Data.Map as M
import qualified Text.Show as S
import qualified Relude.Unsafe as U
import Codec.Winery (WineryRecord)
-- only use lower 4 bits
newtype Word4 = Word4 {
inner :: Word8
} deriving (Eq, Ord, Show, Enum, Generic, C.Serialize)
} deriving (Eq, Ord, Enum, Generic)
deriving C.Serialise via C.WineryRecord Word4
instance S.Show Word4 where
show = show . (.inner)
map2 :: (Word8 -> Word8 -> Word8) -> Word4 -> Word4 -> Word4
map2 f (Word4 a) (Word4 b) = Word4 $ f a b
data CompressWord4List = CompressWord4List
{ xs :: [Word8],
last :: Maybe Word8 -- in lower 4 bits
} deriving (Eq, Ord, Generic, Show, C.Serialize, NFData)
newtype CompressWord4List = CompressWord4List
{ xs :: [Word8]
} deriving (Eq, Ord, Show, Generic)
deriving C.Serialise via C.WineryVariant CompressWord4List
bitwiseAnd a b = getAnd $ And a <> And b
bitwiseOr a b = getIor $ Ior a <> Ior b
toCompressedWord4List [] = CompressWord4List [] Nothing
toCompressedWord4List [Word4 x] = CompressWord4List [] (Just x)
toCompressedWord4List ((Word4 x) : (Word4 x') : xs) =
CompressWord4List
{ xs = headByte : xs',
last = last'
}
where
headByte = bitwiseOr (bitwiseAnd 0xf0 (x .<<. 4)) (bitwiseAnd 0x0f x')
(CompressWord4List xs' last') = toCompressedWord4List xs
toCompressedWord4List = toCompressedWord4List' . U.init
where
toCompressedWord4List' [] = CompressWord4List []
toCompressedWord4List' [Word4 x] = CompressWord4List [x .<<. 4]
toCompressedWord4List' ((Word4 x) : (Word4 x') : xs) =
CompressWord4List
{ xs = headByte : xs'
}
where
headByte = bitwiseOr (bitwiseAnd 0xf0 (x .<<. 4)) (bitwiseAnd 0x0f x')
(CompressWord4List xs') = toCompressedWord4List' xs
fromCompressedWord4List :: CompressWord4List -> [Word4]
fromCompressedWord4List (CompressWord4List [] Nothing) = []
fromCompressedWord4List (CompressWord4List [] (Just a)) = [word4 a]
fromCompressedWord4List (CompressWord4List {xs = (x : xs), ..}) =
Word4 ((bitwiseAnd 0xf0 x) .>>. 4)
: word4 x
: fromCompressedWord4List (CompressWord4List xs last)
fromCompressedWord4List = fromCompressedWord4List' . (.xs)
where
fromCompressedWord4List' :: [Word8] -> [Word4]
fromCompressedWord4List' [] = []
fromCompressedWord4List' (x : xs) =
word4 (x .>>. 4)
: word4 x
: fromCompressedWord4List' xs
-- instance Show Word4 where
-- show :: Word4 -> String
@ -48,10 +64,10 @@ fromCompressedWord4List (CompressWord4List {xs = (x : xs), ..}) =
instance Bounded Word4 where
minBound = word4 0
maxBound = word4 0xf
maxBound = word4 15
instance Real Word4 where
toRational = toRational . inner
toRational = toRational . (.inner)
instance Integral Word4 where
quot = map2 quot `on` clean
@ -66,9 +82,9 @@ instance Integral Word4 where
divMod (Word4 a) (Word4 b) = (word4 a', word4 b')
where
(a', b') = divMod a b
toInteger = toInteger . inner . clean
toInteger = toInteger . (.inner) . clean
clean (Word4 a) = Word4 . bitwiseAnd 0xf $ a
clean (Word4 a) = Word4 . bitwiseAnd 0x0f $ a
word4 = clean . Word4
instance Num Word4 where
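
Note: CompressWord4List now packs two Word4 values into each output byte, high nibble first; fromCompressedWord4List splits each byte back into (x .>>. 4) and the masked low nibble. One worked byte, following headByte above:

-- packing (Word4 0x3, Word4 0xA):
--   bitwiseAnd 0xf0 (0x3 .<<. 4)  =  0x30
--   bitwiseAnd 0x0f 0xA           =  0x0A
--   headByte                      =  0x3A
-- unpacking 0x3A gives word4 0x3 and word4 0xA again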

src/Main.hs

@ -1,71 +0,0 @@
module Main where
import Compress.Huffman
import Compress.Huffman qualified as HT
import Compress.PrefixTree (HuffmanPrefixTree (HuffmanPrefixTree))
import Compress.PrefixTree qualified as PT
import Compress.Arithmetic qualified as A
import Data.Bifunctor qualified as Bi
import Data.Bit qualified as B
import Data.Bits (Bits ((.|.)))
import Data.Bits qualified as B
import Data.ByteString qualified as BS
import Data.HuffmanTree
import Data.Proxy qualified as P
import Data.Serialize qualified as C
import Data.Word
import GHC.Generics (Generic)
import Options.Generic qualified as O
import qualified Data.FiniteBit as FB
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
data CompressionStrategy = Huffman | MarkovHuffman deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
data CLIOpts = CLIOpts
{ task :: CompressOrDecompress,
strategy :: CompressionStrategy
}
deriving (Show, Generic, O.ParseRecord)
applyCompressionOptions ::
forall a.
(Integral a, B.Bits a, B.FiniteBits a, Ord a, C.Serialize a) =>
P.Proxy a ->
CLIOpts ->
BS.ByteString ->
BS.ByteString
applyCompressionOptions _ (CLIOpts Compress Huffman) f =
C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) $ f
applyCompressionOptions _ (CLIOpts Compress MarkovHuffman) f =
C.encode . (PT.compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a a, a)) $ f
applyCompressionOptions _ (CLIOpts Decompress Huffman) f =
handleError $ Bi.second decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanTree a)) $ f
applyCompressionOptions _ (CLIOpts Decompress MarkovHuffman) f =
handleError $ Bi.second PT.decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanPrefixTree a a, a)) $ f
handleError (Right (Just bs)) = bs
handleError _ = []
main :: IO ()
main = do
-- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
f <- BS.getContents
let fAsWords = (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
let compressedUnencoded :: (A.Compressed Word8 (A.Word4MapSerialized Word8)) = (A.compress fAsWords)
let compressed = C.encode compressedUnencoded
print ("compression ratio", (fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
print ("works?", (A.decompress compressedUnencoded) == fAsWords)
-- let f = "hello tehre"
-- f <- BS.readFile "pg64317.txt"
-- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f
-- print $ BS.length . C.encode $ compressed
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word8)) $ f
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word16)) $ f
-- BS.writeFile "outin.txt" decompressed
-- print (decompressed, f)
-- print $ BS.length decompressed
-- print $ BS.length f
-- print (decompressed == f)
-- BS.putStr . applyCompressionOptions (P.Proxy :: P.Proxy Word16) cliOpts $ f

92
test/Test.hs Normal file
View file

@ -0,0 +1,92 @@
import Compress.Arithmetic (twoByteMarkov)
import Compress.Arithmetic qualified as A
import Compress.LengthDistancePairs as LDP
import Compress.WordMarkovStats as WM
import Data.Bifunctor qualified as Bi
import Data.ByteString qualified as BS
import Data.Dirs as D
import Data.Map.Strict qualified as M
import Data.Ratio
import Test.Falsify.Generator qualified as Gen
import Test.Falsify.Predicate as P
import Test.Falsify.Property qualified as P
import Test.Falsify.Range qualified as Range
import Test.Tasty
import Test.Tasty.Falsify as F
import Test.Tasty.HUnit
-- import qualified GHC.Enum as B
main :: IO ()
main = do
toCompress <- -- BS.take 12000 <$>
BS.readFile "pg64317.txt"
defaultMain (tests toCompress)
tests toCompress =
testGroup
"falsify"
[ testGroup
"Arithmetic"
[ testCase "works" $
A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8])
@?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])],
testCase "relativeCounts works as expected with one param" $
A.relativeCounts [(0, 30)]
@?= [(0, 1)],
testCase "relativeCounts works as expected with lots of params" $
A.relativeCounts [(0, 30), (1, 20)]
@?= [(0, 30 % 50), (1, 20 % 50)],
testCase "toRing locs all less than 1" $
assertBool "larger than one" $
all (all ((<= 1) . (.location))) toCompressRing,
testCase "toRing sizes all add up to 1" $
assertBool "larger than 1" $
all ((== 1) . sum . map (.size)) toCompressRing,
testCase "toRing gives no zero sizes" $
assertBool "== 0" $
all (all ((/= 0) . (.size))) toCompressRing,
F.testProperty "binary search" propBinarySearchWithinBounds,
F.testProperty "compress and decompress isomorphism" (propCompressDecompressIsomorphism (A.decompress . A.compress))
]
-- testGroup
-- "LengthDistancePair"
-- [ F.testProperty "compress and decompress isomorphism" $ (propCompressDecompressIsomorphism (LDP.decode . (LDP.encode :: [Word8] -> [LDP.LengthDistancePair Word32])))
-- ]
]
where
toCompressRing =
map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8)))
. M.elems
. A.twoByteMarkov
. BS.unpack
$ toCompress
wordMaxBound :: Integer
wordMaxBound = fromIntegral (maxBound :: Word)
genProperFraction = F.gen . fmap ((% wordMaxBound) . fromIntegral) . Gen.inRange $ (Range.between (0, maxBound :: Word))
-- propBinarySearchWithinBounds :: Property' String ()
propBinarySearchWithinBounds = do
bound1 <- genProperFraction
epsilon' <- genProperFraction
let epsilon = max (1 % (fromIntegral (maxBound :: Word))) . (* 0.1) $ bound1 * epsilon'
let bound2 = if bound1 + epsilon < 1 then bound1 + epsilon else bound1 - epsilon
-- let bound2 = min 1 $ bound1 + epsilon
let iso = D.deserialize . D.serialize $ A.binarySearch bound1 bound2
P.assert $ P.le .$ ("minimum", min bound1 bound2) .$ ("iso", iso)
P.assert $ P.ge .$ ("maximum", max bound1 bound2) .$ ("iso", iso)
-- P.assert $ Range.between (min bound1 bound2) (max bound1 bound2) .$ (A.fromDirs $ A.binarySearch bound1 bound2)
-- propCompressDecompressIsomorphism :: F.Property ()
-- propCompressDecompressIsomorphism :: Property' String ()
propCompressDecompressIsomorphism :: ([Word8] -> [Word8]) -> Property' String ()
propCompressDecompressIsomorphism iso = do
(xs :: [Word8]) <-
F.gen $ Gen.list (Range.between (4, 5000)) intGen
P.assert $ eq .$ ("xs", xs) .$ ("changed", iso $ xs)
where
intGen :: F.Gen Word8
intGen = Gen.inRange $ Range.between (0, fromIntegral (maxBound :: Word8))
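
Note: one more property that falls out of the run-length encoding added in this commit is that the counts cover the whole input; a sketch in the same falsify style as above (it assumes Compress.BurrowsWheeler (toCounts) is imported, which this file currently does not do):

propRunLengthsCoverInput :: Property' String ()
propRunLengthsCoverInput = do
  (xs :: [Word8]) <- F.gen $ Gen.list (Range.between (0, 200)) intGen
  -- each (value, count) pair stands for count + 1 occurrences
  let total = sum . map ((+ 1) . (fromIntegral :: Word8 -> Int) . snd) $ (toCounts xs :: [(Word8, Word8)])
  P.assert $ eq .$ ("input length", length xs) .$ ("run-length total", total)
  where
    intGen :: F.Gen Word8
    intGen = Gen.inRange $ Range.between (0, fromIntegral (maxBound :: Word8))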