benchmark, test suite, and run length encoding

This commit is contained in:
Jack Wines 2025-10-27 13:15:58 -07:00
parent 2123636291
commit da83f9a5d0
Signed by: Jack
SSH key fingerprint: SHA256:AaP2Hr/e3mEjeY+s9XJmQqAesqEms8ENRhwRkpO0WUk
19 changed files with 884 additions and 364 deletions

1
.gitignore vendored
View file

@ -6,3 +6,4 @@
/cabal.project.local /cabal.project.local
/compress.hp /compress.hp
/*.prof /*.prof
/calgarycorpus/

15
bench/Bench.hs Normal file
View file

@ -0,0 +1,15 @@
import Criterion.Main
import Compress.LengthDistancePairs
import qualified Data.ByteString as BS
import qualified Compress.Arithmetic as A
main = do
testData <- BS.unpack . BS.take 10000 <$> (BS.readFile "pg64317.txt")
defaultMain [
bgroup "encode" [
bench "length distance pair" $ nf (encode :: [Word8] -> [LengthDistancePair Word32]) testData
],
bgroup "arithmetic coding" [
bench "pg64317.txt" $ nf A.compress testData
]
]
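
The benchmark loads its input in main, so the file read already sits outside the timed region; criterion's env combinator is another way to express that setup, forcing the data to normal form before any timing starts. A minimal standalone sketch of that pattern (loadInput is an illustrative name, not from this repo):

import Criterion.Main
import qualified Data.ByteString as BS
import Data.Word (Word8)

-- Load and force the sample input outside the timed region.
loadInput :: IO [Word8]
loadInput = BS.unpack . BS.take 10000 <$> BS.readFile "pg64317.txt"

main :: IO ()
main = defaultMain
  [ env loadInput $ \input ->
      bgroup "baseline" [bench "length" $ nf length input]
  ]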

View file

@ -6,23 +6,26 @@ build-type: Simple
license: MIT license: MIT
license-file: LICENSE license-file: LICENSE
executable compress common deps
main-is:
Main.hs
default-extensions: default-extensions:
DataKinds, DataKinds,
DeriveAnyClass, DeriveAnyClass,
DeriveGeneric, DeriveGeneric,
DerivingStrategies,
DerivingVia,
DuplicateRecordFields, DuplicateRecordFields,
ExtendedDefaultRules, ExtendedDefaultRules,
FlexibleContexts, FlexibleContexts,
FlexibleInstances, FlexibleInstances,
ImpredicativeTypes, ImpredicativeTypes,
InstanceSigs, InstanceSigs,
LambdaCase,
MultiParamTypeClasses, MultiParamTypeClasses,
NamedFieldPuns, NamedFieldPuns,
NoFieldSelectors,
OverloadedLabels, OverloadedLabels,
OverloadedLists, OverloadedLists,
OverloadedRecordDot,
OverloadedStrings, OverloadedStrings,
PartialTypeSignatures, PartialTypeSignatures,
RankNTypes, RankNTypes,
@ -32,15 +35,12 @@ executable compress
StandaloneDeriving, StandaloneDeriving,
StrictData, StrictData,
TemplateHaskell, TemplateHaskell,
LambdaCase,
TupleSections, TupleSections,
TypeApplications, TypeApplications,
TypeFamilies, TypeFamilies,
TypeOperators, TypeOperators,
TypeSynonymInstances, TypeSynonymInstances,
UndecidableInstances, UndecidableInstances,
hs-source-dirs:
src
mixins: mixins:
base hiding (Prelude), base hiding (Prelude),
relude (Relude as Prelude), relude (Relude as Prelude),
@ -48,17 +48,21 @@ executable compress
build-depends: build-depends:
base, base,
bitvec, bitvec,
bytestring,
cereal, cereal,
bytestring,
containers, containers,
winery,
parsec,
-- accelerate, -- accelerate,
-- containers-accelerate, -- containers-accelerate,
parsec,
leancheck, leancheck,
monad-par, monad-par,
monad-par-extras, monad-par-extras,
nonempty-containers, nonempty-containers,
optparse-generic, optparse-generic,
relude, relude,
pointless-fun,
text, text,
uuid, uuid,
vector, vector,
@ -66,15 +70,63 @@ executable compress
default-language: default-language:
GHC2021 GHC2021
other-modules: other-modules:
Data.PQueue Compress.Arithmetic
Data.FiniteBit Compress.BurrowsWheeler
Compress.Huffman Compress.Huffman
Compress.PrefixTree Compress.PrefixTree
Compress.WordMarkovStats
Compress.LengthDistancePairs
Data.Dirs
Data.FiniteBit
Data.HuffmanTree Data.HuffmanTree
Compress.Arithmetic Data.PQueue
Data.Word4 Data.Word4
Data.ArbitraryPrecisionFloatingPoint
-- Data.CircularList
hs-source-dirs:
src
ghc-options: ghc-options:
-threaded -threaded
-fprof-auto "-with-rtsopts=-N"
-fprof-late
"-with-rtsopts=-p -hc -B -N -qa"
benchmark bench
import: deps
type:
exitcode-stdio-1.0
main-is:
Bench.hs
build-depends:
criterion
hs-source-dirs:
bench
ghc-options:
"-with-rtsopts=-T -p -hc -B -qa"
test-suite test
import: deps
type:
exitcode-stdio-1.0
main-is:
Test.hs
build-depends:
tasty,
tasty-hunit,
falsify
hs-source-dirs:
test
executable compress
import: deps
main-is:
Main.hs
hs-source-dirs:
src-exe
-- -fllvm

131
flake.lock generated
View file

@ -1,18 +1,34 @@
{ {
"nodes": { "nodes": {
"accelerate": { "barbies": {
"flake": false, "flake": false,
"locked": { "locked": {
"lastModified": 1732969010, "lastModified": 1712605099,
"narHash": "sha256-Qrmtrgij2GbklBXUK42Pt6Db8WiGijA5sz5oC5AR72c=", "narHash": "sha256-jDyIDPiGWAw4qLRoYA4p6njANOg4/EOCx0jmFl607IM=",
"owner": "AccelerateHS", "owner": "jcpetruzza",
"repo": "accelerate", "repo": "barbies",
"rev": "02da6161ef143a9886c8bce542cd96029c4f527a", "rev": "856bc3d3cc72a13e95ed495afd15683c45c7cc55",
"type": "github" "type": "github"
}, },
"original": { "original": {
"owner": "AccelerateHS", "owner": "jcpetruzza",
"repo": "accelerate", "repo": "barbies",
"type": "github"
}
},
"barbies-th": {
"flake": false,
"locked": {
"lastModified": 1730091166,
"narHash": "sha256-kc3ObxnhAJW6vgroFIax/qBOrp3HIWoLRtu0jsJUGIE=",
"owner": "fumieval",
"repo": "barbies-th",
"rev": "46c7b8c68634b219ff12e7966983f9b46a5976d4",
"type": "github"
},
"original": {
"owner": "fumieval",
"repo": "barbies-th",
"type": "github" "type": "github"
} }
}, },
@ -21,11 +37,11 @@
"nixpkgs-lib": "nixpkgs-lib" "nixpkgs-lib": "nixpkgs-lib"
}, },
"locked": { "locked": {
"lastModified": 1733312601, "lastModified": 1756770412,
"narHash": "sha256-4pDvzqnegAfRkPwO3wmwBhVi/Sye1mzps0zHWYnP88c=", "narHash": "sha256-+uWLQZccFHwqpGqr2Yt5VsW/PbeJVTn9Dk6SHWhNRPw=",
"owner": "hercules-ci", "owner": "hercules-ci",
"repo": "flake-parts", "repo": "flake-parts",
"rev": "205b12d8b7cd4802fbcb8e8ef6a0f1408781a4f9", "rev": "4524271976b625a4a605beefd893f270620fd751",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -51,11 +67,11 @@
}, },
"haskell-flake": { "haskell-flake": {
"locked": { "locked": {
"lastModified": 1734984991, "lastModified": 1756607542,
"narHash": "sha256-oUYtRBD3Yhw2jvKYo0lfd82fgEQQbFoiJcHO923gmOc=", "narHash": "sha256-+99fEAk0HwjYgIW2tEOs7ayBDxnU9NAM5E29ZxgyX40=",
"owner": "srid", "owner": "srid",
"repo": "haskell-flake", "repo": "haskell-flake",
"rev": "daf00052906bdd977e57a07f7048437214232e87", "rev": "73e3891fb135c679a1c30fae4b101e5b41b8ca61",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -64,42 +80,6 @@
"type": "github" "type": "github"
} }
}, },
"llvm-hs": {
"flake": false,
"locked": {
"lastModified": 1665495332,
"narHash": "sha256-JKrpUmHJ1nsNiCoHhV5FCcdQGlNFfD37Oiu5kSmghfM=",
"owner": "llvm-hs",
"repo": "llvm-hs",
"rev": "423220bffac4990d019fc088c46c5f25310d5a33",
"type": "github"
},
"original": {
"owner": "llvm-hs",
"ref": "llvm-12",
"repo": "llvm-hs",
"type": "github"
}
},
"llvm-hs-pure": {
"flake": false,
"locked": {
"dir": "llvm-hs-pure",
"lastModified": 1665495332,
"narHash": "sha256-JKrpUmHJ1nsNiCoHhV5FCcdQGlNFfD37Oiu5kSmghfM=",
"owner": "llvm-hs",
"repo": "llvm-hs",
"rev": "423220bffac4990d019fc088c46c5f25310d5a33",
"type": "github"
},
"original": {
"dir": "llvm-hs-pure",
"owner": "llvm-hs",
"ref": "llvm-12",
"repo": "llvm-hs",
"type": "github"
}
},
"mission-control": { "mission-control": {
"locked": { "locked": {
"lastModified": 1733438716, "lastModified": 1733438716,
@ -117,11 +97,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1734649271, "lastModified": 1756542300,
"narHash": "sha256-4EVBRhOjMDuGtMaofAIqzJbg4Ql7Ai0PSeuVZTHjyKQ=", "narHash": "sha256-tlOn88coG5fzdyqz6R93SQL5Gpq+m/DsWpekNFhqPQk=",
"owner": "nixos", "owner": "nixos",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "d70bd19e0a38ad4790d3913bf08fcbfc9eeca507", "rev": "d7600c775f877cd87b4f5a831c28aa94137377aa",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -133,28 +113,31 @@
}, },
"nixpkgs-lib": { "nixpkgs-lib": {
"locked": { "locked": {
"lastModified": 1733096140, "lastModified": 1754788789,
"narHash": "sha256-1qRH7uAUsyQI7R1Uwl4T+XvdNv778H0Nb5njNrqvylY=", "narHash": "sha256-x2rJ+Ovzq0sCMpgfgGaaqgBSwY+LST+WbZ6TytnT9Rk=",
"type": "tarball", "owner": "nix-community",
"url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz" "repo": "nixpkgs.lib",
"rev": "a73b9c743612e4244d865a2fdee11865283c04e6",
"type": "github"
}, },
"original": { "original": {
"type": "tarball", "owner": "nix-community",
"url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz" "repo": "nixpkgs.lib",
"type": "github"
} }
}, },
"root": { "root": {
"inputs": { "inputs": {
"accelerate": "accelerate", "barbies": "barbies",
"barbies-th": "barbies-th",
"flake-parts": "flake-parts", "flake-parts": "flake-parts",
"flake-root": "flake-root", "flake-root": "flake-root",
"haskell-flake": "haskell-flake", "haskell-flake": "haskell-flake",
"llvm-hs": "llvm-hs",
"llvm-hs-pure": "llvm-hs-pure",
"mission-control": "mission-control", "mission-control": "mission-control",
"nixpkgs": "nixpkgs", "nixpkgs": "nixpkgs",
"systems": "systems", "systems": "systems",
"treefmt-nix": "treefmt-nix" "treefmt-nix": "treefmt-nix",
"winery": "winery"
} }
}, },
"systems": { "systems": {
@ -179,11 +162,11 @@
] ]
}, },
"locked": { "locked": {
"lastModified": 1734982074, "lastModified": 1756662192,
"narHash": "sha256-N7M37KP7cHWoXicuE536GrVvU8nMDT/gpI1kja2hkdg=", "narHash": "sha256-F1oFfV51AE259I85av+MAia221XwMHCOtZCMcZLK2Jk=",
"owner": "numtide", "owner": "numtide",
"repo": "treefmt-nix", "repo": "treefmt-nix",
"rev": "e41e948cf097cbf96ba4dff47a30ea6891af9f33", "rev": "1aabc6c05ccbcbf4a635fb7a90400e44282f61c4",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -191,6 +174,22 @@
"repo": "treefmt-nix", "repo": "treefmt-nix",
"type": "github" "type": "github"
} }
},
"winery": {
"flake": false,
"locked": {
"lastModified": 1732329065,
"narHash": "sha256-CDFOD4B6cJgAe3ebTVF21/eCJtbHMEzmKF3b/XxMOw4=",
"owner": "fumieval",
"repo": "winery",
"rev": "2d6976bc822f6d2c2d590703b9470cb9e62191a5",
"type": "github"
},
"original": {
"owner": "fumieval",
"repo": "winery",
"type": "github"
}
} }
}, },
"root": "root", "root": "root",

View file

@ -8,19 +8,24 @@
flake-root.url = "github:srid/flake-root"; flake-root.url = "github:srid/flake-root";
treefmt-nix.url = "github:numtide/treefmt-nix"; treefmt-nix.url = "github:numtide/treefmt-nix";
treefmt-nix.inputs.nixpkgs.follows = "nixpkgs"; treefmt-nix.inputs.nixpkgs.follows = "nixpkgs";
accelerate = {
url = "github:AccelerateHS/accelerate?submodules=1";
flake = false;
};
llvm-hs = {
url = "github:llvm-hs/llvm-hs?ref=llvm-12";
flake = false;
};
llvm-hs-pure = {
url = "github:llvm-hs/llvm-hs?dir=llvm-hs-pure&ref=llvm-12";
flake = false;
};
mission-control.url = "github:Platonic-Systems/mission-control"; mission-control.url = "github:Platonic-Systems/mission-control";
winery = {
url = "github:fumieval/winery";
flake = false;
};
barbies-th = {
url = "github:fumieval/barbies-th";
flake = false;
};
barbies = {
url = "github:jcpetruzza/barbies";
flake = false;
};
}; };
outputs = inputs: outputs = inputs:
@ -38,7 +43,7 @@
# See https://github.com/srid/haskell-flake/blob/master/example/flake.nix # See https://github.com/srid/haskell-flake/blob/master/example/flake.nix
haskellProjects.default = { haskellProjects.default = {
# The base package set (this value is the default) # The base package set (this value is the default)
basePackages = pkgs.haskell.packages.ghc96; # basePackages = pkgs.haskell.packages.ghc910;
# Packages to add on top of `basePackages` # Packages to add on top of `basePackages`
packages = { packages = {
@ -46,30 +51,32 @@
# (Local packages are added automatically) # (Local packages are added automatically)
# https://github.com/lehins/hip.git # https://github.com/lehins/hip.git
# hip.source = inputs.hip + "/hip"; # hip.source = inputs.hip + "/hip";
accelerate.source = inputs.accelerate; winery.source = inputs.winery;
# llvm-hs.source = inputs.llvm-hs; barbies-th.source = inputs.barbies-th;
llvm-hs-pure.source = inputs.llvm-hs-pure; barbies.source = inputs.barbies;
}; };
# Add your package overrides here # Add your package overrides here
settings = { settings = {
uuid.jailbreak = true; uuid.jailbreak = true;
accelerate = # accelerate =
{ # {
jailbreak = true; # jailbreak = true;
broken = false; # broken = false;
}; # };
llvm-hs = # winery.broken = false;
{ winery.jailbreak = true;
jailbreak = true; winery.check = false;
broken = false; barbies-th.broken = false;
}; barbies-th.jailbreak = true;
llvm-hs-pure = barbies.broken = false;
{ barbies.jailbreak = true;
jailbreak = true; barbies.check = false;
broken = false; tree-diff.check = false;
}; # hlint.jailbreak = true;
# ghc-lib-parser.jailbreak = true;
# ghc-lib-parser-ex.jailbreak = true;
# gmp-with-cxx.jailbreak = true;
}; };
# hlint = { # hlint = {
# jailbreak = true; # jailbreak = true;
@ -96,7 +103,7 @@
programs.ormolu.enable = true; programs.ormolu.enable = true;
programs.nixpkgs-fmt.enable = true; programs.nixpkgs-fmt.enable = true;
programs.cabal-fmt.enable = true; programs.cabal-fmt.enable = true;
programs.hlint.enable = false; # programs.hlint.enable = false;
# We use fourmolu # We use fourmolu
programs.ormolu.package = pkgs.haskellPackages.fourmolu; programs.ormolu.package = pkgs.haskellPackages.fourmolu;
@ -110,7 +117,6 @@
hoogle = { hoogle = {
description = "Start Hoogle server for project dependencies"; description = "Start Hoogle server for project dependencies";
exec = '' exec = ''
echo http://127.0.0.1:8888;
hoogle serve -p 8888 --local; hoogle serve -p 8888 --local;
''; '';
category = "Dev Tools"; category = "Dev Tools";
@ -139,6 +145,7 @@
config.treefmt.build.devShell config.treefmt.build.devShell
]; ];
nativeBuildInputs = with pkgs; [ nativeBuildInputs = with pkgs; [
# llvmPackages_16.libllvm
]; ];
}; };
}; };

97
src-exe/Main.hs Normal file
View file

@ -0,0 +1,97 @@
module Main where
import Compress.Huffman
import Compress.Huffman qualified as HT
import Compress.PrefixTree qualified as PT
import Compress.Arithmetic qualified as A
import Data.Bifunctor qualified as Bi
import Data.Bit qualified as B
import Data.Bits (Bits ((.|.)))
import Data.Bits qualified as B
import Data.ByteString qualified as BS
import Data.HuffmanTree
import Data.Proxy qualified as P
import Codec.Winery qualified as C
import Data.Word
import GHC.Generics (Generic)
import Options.Generic qualified as O
import qualified Data.FiniteBit as FB
import qualified Relude.Unsafe as U
import qualified Data.ByteString.Lazy as BSL
-- import Data.Word4 (Word4(Word4))
import Compress.BurrowsWheeler
import qualified Control.Monad.Par as P
import qualified Compress.LengthDistancePairs as LDP
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
data CompressionStrategy = Huffman | MarkovHuffman deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
data CLIOpts = CLIOpts
{ task :: CompressOrDecompress,
strategy :: CompressionStrategy
}
deriving (Show, Generic, O.ParseRecord)
-- applyCompressionOptions ::
-- forall a.
-- (Integral a, B.Bits a, B.FiniteBits a, Ord a, C.Serialise a) =>
-- P.Proxy a ->
-- CLIOpts ->
-- BS.ByteString ->
-- BS.ByteString
-- applyCompressionOptions _ (CLIOpts Compress Huffman) f =
-- C.serialise . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) $ f
-- applyCompressionOptions _ (CLIOpts Compress MarkovHuffman) f =
-- C.encode . (PT.compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a a, a)) $ f
-- applyCompressionOptions _ (CLIOpts Decompress Huffman) f =
-- handleError $ Bi.second decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanTree a)) $ f
-- applyCompressionOptions _ (CLIOpts Decompress MarkovHuffman) f =
-- handleError $ Bi.second PT.decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanPrefixTree a a, a)) $ f
handleError (Right (Just bs)) = bs
handleError _ = []
main :: IO ()
main = do
-- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
f <- BS.getContents
let f' = BS.unpack f
print . BS.pack . A.decompress . A.compress $ f'
-- print =<< (P.runParIO $ do
-- let fCounts = (toCounts :: [Word8] -> [(Word8, Word64)]) . {- toBurrowsWheeler . -} (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
-- let fAsWords = map fst fCounts
-- -- let compressedUnencoded {- :: (A.Compressed Word8 (A.Word4MapSerialized Word8)) -} = (A.compress fAsWords)
-- -- print . A.talkAboutCompressed $ compressedUnencoded
-- -- let compressed = C.serialise (compressedUnencoded, A.compress . map snd $ fCounts)
-- -- huffmanCompressionRatio' <- P.spawnP $ (compressionRatioFor (Proxy :: Proxy Word8) (P.Proxy :: Proxy Word8) f)
-- -- arithmaticCodingCompressionRatio' <- P.spawnP $ ((fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
-- -- huffmanCompressionRatio <- P.get huffmanCompressionRatio'
-- -- arithmaticCodingCompressionRatio <- P.get arithmaticCodingCompressionRatio'
-- -- let lengthDistancePairsCompressedSize = fromIntegral . BS.length . C.serialise . (LDP.encode :: ByteString -> [LDP.LengthDistancePair Word16]) $ f
-- -- let lengthDistancePairsCompressionRatio :: Double = lengthDistancePairsCompressedSize / (fromIntegral $ BS.length f)
-- pure ()) -- (lengthDistancePairsCompressionRatio))
-- let decompressed = (A.decompress compressedUnencoded)
-- print ("huffman coding", huffmanCompressionRatio)
-- print ("compression ratio (arithmetic coding)", arithmaticCodingCompressionRatio)
-- print ("works?", decompressed == fAsWords)
-- print . take 10 . drop 70 . zip fAsWords $ decompressed
-- print . ("original length", ) . length $ fAsWords
-- let f = "hello tehre"
-- f <- BS.readFile "pg64317.txt"
-- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f
-- print $ BS.length . C.encode $ compressed
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word8)) $ f
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word16)) $ f
-- BS.writeFile "outin.txt" decompressed
-- print (decompressed, f)
-- print $ BS.length decompressed
-- print $ BS.length f
-- print (decompressed == f)
-- BS.putStr . applyCompressionOptions (P.Proxy :: P.Proxy Word16) cliOpts $ f

View file

@ -1,5 +1,6 @@
-- | https://en.wikipedia.org/wiki/Arithmetic_coding -- | https://en.wikipedia.org/wiki/Arithmetic_coding
module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized) where -- module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized, Word4Map) where
module Compress.Arithmetic where
import Data.Bits qualified as B import Data.Bits qualified as B
import Data.Word import Data.Word
@ -16,12 +17,23 @@ import Data.Map.Strict qualified as M
import Data.Maybe qualified as My import Data.Maybe qualified as My
import Data.Ord import Data.Ord
import Data.Ratio import Data.Ratio
import Data.Serialize qualified as C import Codec.Winery qualified as C
import GHC.Generics (Generic) import GHC.Generics (Generic)
import GHC.Natural (Natural) import GHC.Natural (Natural)
import qualified Relude.Unsafe as US import qualified Relude.Unsafe as US
import Data.Word4 (Word4, CompressWord4List, toCompressedWord4List, fromCompressedWord4List) import Data.Word4 (Word4, CompressWord4List, toCompressedWord4List, fromCompressedWord4List)
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import qualified Relude.Unsafe as U
import Codec.Winery (WineryRecord)
import Compress.BurrowsWheeler (toBurrowsWheeler)
import qualified Data.Bit as BV
import qualified Data.Vector as V
import qualified Data.Bit as VB
import qualified Data.Vector.Unboxed as VU
import Data.Dirs
-- import qualified Debug.Trace as D
import qualified Data.Proxy as P
import Compress.WordMarkovStats as WM
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes -- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
@ -29,41 +41,52 @@ import qualified Data.ByteString as BS
-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums -- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums
data WordMarkovStats = WordMarkovStats toRing :: forall a k. (Show a, Show k, Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
{ location :: Ratio Integer,
size :: Ratio Integer
}
deriving (Show, Eq, Ord, Generic, C.Serialize)
loc' (WordMarkovStats {..}) = location
toRing :: forall a k. (Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
where where
sizes = map ((% maxBound') . fromIntegral . snd) xs
wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
asFracsOfTotalSum = map fst . US.tail . L.scanl' f (0, 0) $ sizes
maxBound' :: Integer maxBound' :: Integer
maxBound' = fromIntegral (maxBound :: a) maxBound' = fromIntegral (maxBound :: a)
resize xs = map ( / sum' ) xs
where
sum' = sum xs
withBumpedZeroSized :: [Rational]
withBumpedZeroSized withBumpedZeroSized
| numZeroSized /= 0 = map (max (remainingSpace / numZeroSized)) asFracsOfTotalSum = resize
| otherwise = asFracsOfTotalSum . map (max minimumSize)
. resize
. map (fromIntegral . snd)
$ xs
where
-- | we have to round the 0s up to something
-- | we somewhat arbitrarily choose half of the smallest
-- | normally representable amount
minimumSize :: Rational
minimumSize = 1 % (maxBound' * 2)
remainingSpace = 1 - sum asFracsOfTotalSum
numZeroSized = L.genericLength . filter (== 0) $ asFracsOfTotalSum
f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum) f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
where where
newFrac = currFrac * (1 - runningSum) newFrac = currFrac * (1 - runningSum)
twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k [(k, b)] relativeCounts :: forall k . Ord k => [(k, Integer)] -> [(k, Rational)]
relativeCounts m = map (Bi.second asFractionOfSum) m
where
asFractionOfSum = ( % sum') . fromIntegral
sum' = fromIntegral . F.sum . map snd $ m
maxBound' :: forall a b . (Num b, Bounded a, Integral a) => P.Proxy a -> b
maxBound' p = fromIntegral (maxBound :: a)
twoByteMarkov :: forall k . (Ord k) => [k] -> M.Map k [(k, Rational)]
twoByteMarkov xs = twoByteMarkov xs =
M.map sizeAsFraction M.map sizeAsFraction
. M.fromListWith (M.unionWith (+)) . M.fromListWith (M.unionWith (+))
@ -72,77 +95,106 @@ twoByteMarkov xs =
. US.tail . US.tail
$ xs $ xs
where where
-- optimization, changes fractions so they represent the fraction
-- of remaining space in the list taken up instead of total space
sizeAsFraction :: M.Map k Integer -> [(k, b)]
sizeAsFraction m =
zip keys
. map discretizeFraction
. fractionOfRemainingSums
$ counts
where
fractionOfRemainingSums xs = zipWith (%) xs . scanr (+) 0 $ xs
asList = L.sortOn (negate . snd) . M.assocs $ m sizeAsFraction :: M.Map k Integer -> [(k, Rational)]
sizeAsFraction = relativeCounts . M.assocs
keys = map fst asList data Chunk a = Chunk
{ location :: SerializedDirs,
start :: a,
length :: Word64
}
deriving (Eq, Ord, Show, Generic, P.NFData)
deriving C.Serialise via C.WineryVariant (Chunk a)
counts = map snd asList -- maxBound = maxBound' (P.Proxy :: Proxy b)
discretizeFraction :: Rational -> b discretizeFraction :: forall b . (Bounded b, Integral b) => Rational -> b
discretizeFraction = floor . fromRational . (* maxBound') discretizeFraction = floor . min maxBound'' . fromRational . (* (succ $ maxBound' (P.Proxy :: P.Proxy b)))
where
maxBound'' = maxBound' (P.Proxy :: P.Proxy b)
maxBound' :: Ratio Integer
maxBound' = fromIntegral (maxBound :: b)
type Word4MapSerialized a = M.Map a ([a], CompressWord4List) type Word4MapSerialized a = M.Map a ([a], CompressWord4List)
type Word4Map a = M.Map a [(a, Word4)] type Word4Map a = M.Map a [(a, Word8)]
-- talkAboutCompressed (Compressed {..}) = (
-- ("markovs (serealized)", BS.length . C.serialiseOnly $ markovs),
-- ("markovs (unserealized)", BS.length . C.serialiseOnly . unserealizeWord4Map $ markovs),
-- ("chunks", BS.length . C.serialiseOnly $ chunks))
newtype Prediction = Prediction {
chances :: [(Word8, Rational)]
}
class CompressionModel a where
createModel :: [Word8] -> a
-- done in reverse
predict :: a -> [Word8] -> Prediction
-- at some point write
-- instance IsoSerialisable a b => C.Serialise b
class C.Serialise b => IsoSerialisable a b where
toSerialisable :: a -> b
fromSerialisable :: b -> a
-- newtype BetterCompressed = BetterCompressed (Compressed Word8 (A.Word4MapSerialized Word8))
-- deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
-- deriving C.Serialise via C.WineryRecord BetterCompressed
data Compressed a mapImplementation = Compressed data Compressed a mapImplementation = Compressed
{ markovs :: mapImplementation, { markovs :: mapImplementation,
chunks :: [Chunk a] chunks :: [Chunk a]
} }
deriving (Eq, Ord, Show, Generic) -- , C.Serialize, P.NFData) deriving (Eq, Ord, Show, Generic, NFData) -- , C.Serialise, P.NFData)
deriving C.Serialise via C.WineryVariant (Compressed a mapImplementation)
deriving instance (Ord a, C.Serialize a) => C.Serialize (Compressed a (Word4MapSerialized a))
data Chunk a = Chunk -- unserealizeWord4Map :: forall a . Show a => Word4MapSerialized a -> Word4Map a
{ location :: Ratio Integer, -- unserealizeWord4Map = M.map unserializedIndividualMap
start :: a, -- where
length :: Int -- unserializedIndividualMap :: ([a], CompressWord4List) -> [(a, Word4)]
} -- unserializedIndividualMap (bytes, sizes) = zip bytes . fromCompressedWord4List $ sizes
deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
unserealizeWord4Map :: Word4MapSerialized a -> Word4Map a -- -- unserealizeWord4Map = M.map (uncurry zip . map (\(bytes, sizes) -> (bytes,) . (++ [maxBound]) . fromCompressedWord4List (Prelude.length bytes) $ sizes))
unserealizeWord4Map = M.map (uncurry zip . Bi.second ((++ [maxBound]) . fromCompressedWord4List))
serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
serealizeWord4Map = M.map (Bi.second (toCompressedWord4List . L.init) . L.unzip) -- serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
-- serealizeWord4Map = M.map (Bi.second toCompressedWord4List . L.unzip)
decompress :: decompress ::
forall a. forall a.
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) => (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
Compressed a (Word4MapSerialized a) -> Compressed a (Word4Map a) ->
[a] [a]
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ chunks decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk
$ chunks
where where
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs' rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs'
markovs' = unserealizeWord4Map markovs markovs' = -- unserealizeWord4Map
markovs
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size)) toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
decompressChunk :: Chunk a -> [a] decompressChunk :: Chunk a -> [a]
decompressChunk (Chunk {..}) = map snd . take length . L.iterate' decompress' $ (location, start) decompressChunk (Chunk {..}) = map snd . take (fromIntegral length) . L.iterate' decompress' $ (location', start) -- (fromBS rawLocation, start)
where where
location' = deserialize location
-- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a) -- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
decompress' (!loc, !prev) = ((loc - ansLoc) / newSize, newVal) decompress' (!loc, !prevWord) = -- traceShow (newSize) $
((loc - ansLoc) / newSize, newVal)
where where
ansLoc :: Ratio Integer ansLoc :: Ratio Integer
newVal :: a newVal :: a
newSize :: Ratio Integer newSize :: Ratio Integer
(!ansLoc, (!newVal, !newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev) (ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prevWord)
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8] sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
sanityCheck = map (sum . M.elems) . M.elems sanityCheck = map (sum . M.elems) . M.elems
@ -156,18 +208,31 @@ chunk chunkSize = chunk'
(xs', xs'') = splitAt chunkSize xs (xs', xs'') = splitAt chunkSize xs
chunkLength :: Int chunkLength :: Int
chunkLength = 8000 chunkLength = 4096
-- runPar = id
-- parMap = map
-- compress :: -- compress ::
-- forall a. -- forall a.
-- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) => -- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
-- [a] -> -- [a] ->
-- Compressed a (Word4MapSerialized a) -- Compressed a (Word4MapSerialized a)
compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))) $ Compressed (serealizeWord4Map twoByteMarkovs) . P.runPar . P.parMap compressChunk $ unCompressedChunks compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a (Word4Map a)
compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap
(compressChunk)
$ unCompressedChunks
where where
toCompress = toCompress' -- toBurrowsWheeler toCompress'
-- toTraceShow' = (unserealizeWord4Map . serealizeWord4Map $ twoByteMarkovs) == twoByteMarkovs
-- toTraceShow = (fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))
unCompressedChunks = chunk chunkLength toCompress unCompressedChunks = chunk chunkLength toCompress
twoByteMarkovs = twoByteMarkov toCompress twoByteMarkovs = M.map (map (Bi.second discretizeFraction)) . twoByteMarkov $ toCompress
rings = M.map toRing twoByteMarkovs rings = M.map toRing twoByteMarkovs
@ -175,12 +240,23 @@ compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.
Chunk Chunk
{ location = shortestLocation endStats, { location = shortestLocation endStats,
start = US.head toCompress, start = US.head toCompress,
length = L.length toCompress length = fromIntegral . L.length $ toCompress
} }
where where
pairs = zip toCompress . US.tail $ toCompress pairs = zip toCompress . US.tail $ toCompress
shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size) shortestLocation (WordMarkovStats {..}) = serialize $ binarySearch location (location + size)
-- shortestLocation (WordMarkovStats {..})
-- -- | simplestBetweenAnswer /= upperBound = simplestBetweenAnswer
-- | otherwise = simplestBetween location $ upperBound - epsilon
-- where
-- simplestBetweenAnswer = simplestBetween location upperBound
-- upperBound = location + size
-- -- | almost entirely arbitrary, picked because it's guaranteed to be smaller than size
-- -- | and should make for a fairly quick subtraction
-- epsilon = 1 % (denominator size)
endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
@ -194,6 +270,7 @@ compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.
size = prevSize * nextSize size = prevSize * nextSize
} }
pyramidFold :: (a -> a -> a) -> [a] -> a pyramidFold :: (a -> a -> a) -> [a] -> a
pyramidFold f = pyramid pyramidFold f = pyramid
where where
@ -209,6 +286,7 @@ simplestBetween :: Rational -> Rational -> Rational
simplestBetween x y simplestBetween x y
| x == y = x | x == y = x
| x > 0 = simplestBetween' n d n' d' | x > 0 = simplestBetween' n d n' d'
-- | x > 0 = simplestBetween' n d n' d'
| otherwise = 0 % 1 | otherwise = 0 % 1
where where
n = numerator x n = numerator x
@ -226,3 +304,15 @@ simplestBetween x y
nd'' = simplestBetween' d' r' d r nd'' = simplestBetween' d' r' d r
n'' = numerator nd'' n'' = numerator nd''
d'' = denominator nd'' d'' = denominator nd''
binarySearch x y = binarySearch' 0 1
where
acceptLow = min x y
acceptHigh = max x y
binarySearch' searchSpaceLow searchSpaceHigh
| mid < acceptLow = Higher : binarySearch' mid searchSpaceHigh
| mid > acceptHigh = Lower : binarySearch' searchSpaceLow mid
| otherwise = []
where
mid = (searchSpaceLow + searchSpaceHigh) / 2
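
For intuition about binarySearch above: it bisects [0, 1), emitting a Higher or Lower step until the midpoint lands inside the target interval, and the direction list is later decoded back into a rational (see Data.Dirs below). A self-contained sketch of the same idea, with a worked example in the comments:

import Data.Ratio ((%))

data Dir = Higher | Lower deriving Show

-- Bisect [0,1) until the midpoint falls inside [min x y, max x y].
dirsBetween :: Rational -> Rational -> [Dir]
dirsBetween x y = go 0 1
  where
    lo = min x y
    hi = max x y
    go a b
      | mid < lo  = Higher : go mid b
      | mid > hi  = Lower  : go a mid
      | otherwise = []
      where
        mid = (a + b) / 2

-- Decode the directions back into a rational inside the interval.
fromDirs :: [Dir] -> Rational
fromDirs = sum . (1 % 2 :) . zipWith step (iterate (/ 2) (1 % 4))
  where
    step f Higher = f
    step f Lower  = negate f

-- dirsBetween (1 % 5) (1 % 4) == [Lower]
-- fromDirs [Lower]            == 1 % 4, which lies inside [1/5, 1/4]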

View file

@ -0,0 +1,27 @@
-- |
module Compress.BurrowsWheeler where
-- import qualified Data.CircularList as CL
import qualified Relude.Unsafe as U
-- import Compress.BurrowsWheeler (toBurrowsWheeler)
toBurrowsWheeler (x:xs) = map U.head . sortOn (U.tail) . take (pred . length $ xs) . tails $ xs'
where
xs' = toList $ (x :| xs) <> (x :| [])
toCounts :: forall a b . (Eq a, Eq b, Num b, Enum b, Bounded a, Bounded b) => [a] -> [(a, b)]
toCounts = reverse . foldl' f []
where
f [] x = [(x, 0)]
f ((x, count) : xs) newVal
| x == newVal && count == maxBound = (x, 0) : (x, count) : xs
| x == newVal = (x, succ count) : xs
| otherwise = (newVal, 0) : (x, count) : xs
-- toBurrowsWheeler xs = map last . sort . map (findLyndonWord) . take (length xs) . tails $ xs'
-- where
-- xs' = xs ++ xs
-- findLyndonWord (x : xs) = (x :|) . map snd . takeWhile (uncurry (<=)) . zip (x : xs) $ xs
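
The run-length half of this commit is toCounts: the counter records extra repeats, so a count of 0 means the symbol occurred once, and a run longer than maxBound spills into a fresh pair. A standalone sketch with a worked example:

import Data.List (foldl')
import Data.Word (Word8)

-- Run-length counting: (symbol, number of *additional* repeats).
runLengths :: Eq a => [a] -> [(a, Word8)]
runLengths = reverse . foldl' step []
  where
    step [] x = [(x, 0)]
    step ((y, n) : rest) x
      | y == x && n == maxBound = (x, 0) : (y, n) : rest
      | y == x                  = (y, succ n) : rest
      | otherwise               = (x, 0) : (y, n) : rest

-- runLengths "aaabcc" == [('a',2),('b',0),('c',1)]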

View file

@ -20,7 +20,7 @@ import Data.Bits as B
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Data.Maybe qualified as My import Data.Maybe qualified as My
import Data.PQueue qualified as PQ import Data.PQueue qualified as PQ
import Data.Serialize qualified as C import Codec.Winery qualified as C
import Data.Text qualified as T import Data.Text qualified as T
import Data.Text.IO qualified as TIO import Data.Text.IO qualified as TIO
import Data.Vector.Unboxed qualified as V import Data.Vector.Unboxed qualified as V
@ -30,6 +30,8 @@ import GHC.Generics (Generic)
import Options.Generic qualified as O import Options.Generic qualified as O
import System.Environment qualified as SE import System.Environment qualified as SE
import Data.HuffmanTree import Data.HuffmanTree
import Data.Word4 (CompressWord4List(xs))
import Compress.BurrowsWheeler (toBurrowsWheeler, toCounts)
decompress decompress
:: forall a :: forall a
@ -50,13 +52,27 @@ decompress (TreeDirs treeDirs, tree) = BS.concat . map toByteString <$> decompre
nextLeaf (L : xs) (Node {..}) = nextLeaf xs left nextLeaf (L : xs) (Node {..}) = nextLeaf xs left
nextLeaf (R : xs) (Node {..}) = nextLeaf xs right nextLeaf (R : xs) (Node {..}) = nextLeaf xs right
data BurrowsWheelerCompressed a b = BurrowsWheelerCompressed {
elements :: Compressed SerializedTreeDirs a,
counts :: Compressed SerializedTreeDirs b
}
deriving (Eq, Ord, Generic) -- , C.Serialise, P.NFData)
deriving C.Serialise via C.WineryRecord (BurrowsWheelerCompressed a b)
data Compressed treeDir a = Compressed {
treeDirs :: treeDir,
huffman :: HuffmanTree a
}
deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
deriving C.Serialise via C.WineryRecord (Compressed treeDir a)
compress compress
:: forall a :: forall a
. (Ord a, Integral a, B.FiniteBits a, B.Bits a) . (Ord a, Integral a)
=> BS.ByteString => [a]
-> Maybe (TreeDirs, HuffmanTree a) -> Maybe (Compressed SerializedTreeDirs a)
compress bs = compress dividedByteString =
liftA2 (,) (TreeDirs <$> treeDirections) mergedHuffmanTrees liftA2 Compressed (serializeTreeDirs . TreeDirs <$> treeDirections) mergedHuffmanTrees
where where
treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString
@ -64,13 +80,19 @@ compress bs =
mergeHuffmanTrees mergeHuffmanTrees
. PQ.fromList . PQ.fromList
. map (uncurry (flip (,)) . Bi.first Leaf) . map (uncurry (flip (,)) . Bi.first Leaf)
. counts . countOccurances
$ dividedByteString $ dividedByteString
treeDirMap :: M.Map a [TreeDir] treeDirMap :: M.Map a [TreeDir]
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
dividedByteString = toWordsList bs compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b)
compressWithBurrowsWheeler bs
= liftA2 BurrowsWheelerCompressed
(compress . map fst $ withCounts)
(compress . map snd $ withCounts)
where
withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs
-- testCompression -- testCompression
-- :: forall a -- :: forall a
@ -100,10 +122,13 @@ compress bs =
-- decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBitsOps a, C.Serialize a) => BS.ByteString -> Either String (Maybe (TreeDirs, HuffmanTree a)) -- decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBitsOps a, C.Serialize a) => BS.ByteString -> Either String (Maybe (TreeDirs, HuffmanTree a))
-- decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode -- decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode
counts :: (Ord a) => [a] -> [(a, Int)]
counts = M.toList . F.foldl' combiningInsert M.empty -- I've replaced this with a simpler version but it might be slower
where countOccurances :: (Ord a) => [a] -> [(a, Int)]
combiningInsert m key = M.insertWith (+) key 1 m countOccurances = M.toList . M.fromListWith (+) . map (, 1)
-- countOccurances = M.toList . F.foldl' combiningInsert M.empty
-- where
-- combiningInsert m key = M.insertWith (+) key 1 m
divideByteString :: Int -> BS.ByteString -> [BS.ByteString] divideByteString :: Int -> BS.ByteString -> [BS.ByteString]
divideByteString n [] = [] divideByteString n [] = []
@ -113,15 +138,17 @@ divideByteString n bs = x : divideByteString n xs
compressionRatioFor compressionRatioFor
:: forall a :: forall a b
. (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialize a) . (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a,
Integral b, Ord b, C.Serialise b, Bounded b)
=> D.Proxy a => D.Proxy a
-> D.Proxy b
-> BS.ByteString -> BS.ByteString
-> Double -> Double
compressionRatioFor proxy bs = compressionRatioFor _ _ bs =
(/ (fromIntegral . BS.length $ bs)) (/ (fromIntegral . BS.length $ bs))
. fromIntegral . fromIntegral
. BS.length . BS.length
. C.encode . C.serialise
. (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) . (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b))
$ bs $ bs
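
A quick standalone check of what the new counting helper returns (Data.Map hands the keys back in ascending order):

import qualified Data.Map.Strict as M

-- Count occurrences of each element; the result is sorted by key.
occurrences :: Ord a => [a] -> [(a, Int)]
occurrences = M.toList . M.fromListWith (+) . map (\x -> (x, 1))

-- occurrences "abracadabra" == [('a',5),('b',2),('c',1),('d',1),('r',2)]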

View file

@ -0,0 +1,97 @@
module Compress.LengthDistancePairs where
import qualified Data.ByteString as BS
import Data.Word
import qualified Data.Foldable as F
import qualified Relude.Unsafe as US
import qualified Debug.Trace as D
import Compress.Arithmetic (maxBound')
import qualified Data.Proxy as P
import qualified Codec.Winery as C
import qualified Relude.Unsafe as U
data LengthDistancePair a = Literal Word8 | Repeat {
distance :: a,
matchLength :: Word8
}
deriving (Eq, Ord, Show, Generic, NFData)
deriving C.Serialise via (C.WineryVariant (LengthDistancePair a))
ldpLength :: LengthDistancePair a -> Int
ldpLength (Literal _) = 1
ldpLength (Repeat {..}) = fromIntegral matchLength
-- | normally, we'd order `previous` in reverse, but finding
-- | the longest-matching-subsequence starting at `next`
-- | doesn't let us.
-- | The linked list doesn't really lend itself to the problem
data Cursor = Cursor {
previous :: [Word8],
next :: [Word8]
} deriving (Eq, Ord, Show, Generic)
toCursor :: BS.ByteString -> Cursor
toCursor bs= Cursor {
previous = [],
next = BS.unpack bs
}
word8Max :: Int
word8Max = fromIntegral (maxBound :: Word8)
cutTo :: Int -> [a] -> [a]
cutTo newLength xs = drop (max 0 . flip (-) newLength . length $ xs) xs
decode :: forall a . Integral a => [LengthDistancePair a] -> [Word8]
decode = foldl' decodeIndividual []
where
decodeIndividual xs (Literal x) = xs ++ [x]
decodeIndividual xs (Repeat {..})
= (++) xs
. take (fromIntegral matchLength)
. drop (F.length xs - fromIntegral distance)
$ xs
encode :: forall a . (Integral a, Bounded a) => [Word8] -> [LengthDistancePair a]
encode xs = encode' xs xs 0
where
encode' :: [Word8] -> [Word8] -> Int -> [LengthDistancePair a]
encode' predictFrom [] n = []
encode' predictFrom next 0 = []
encode' predictFrom next distance = match : encode' (drop (maxLength - newDistance) predictFrom) (drop matchLength next) newDistance
where
(match, matchLength)
= F.maximumBy (comparing snd)
. map (\x -> (x, ldpLength x))
. ( Literal (U.head next) : )
. zipWith Repeat (reverse [1 .. (fromIntegral distance)])
. map lengthMatchWithRepeat . tails
. take distance
$ predictFrom
maxLength = fromIntegral (minBound :: a)
newDistance = max maxLength . (+ matchLength) $ distance
lengthMatchWithRepeat cyclexs' = zipWithCycle' cyclexs' next
where
zipWithCycle' [] ys = zipWithCycle' cyclexs' ys
zipWithCycle' (x:xs) (y:ys) | x == y = 1 + zipWithCycle' xs ys
zipWithCycle' _ _ = 0
-- lengthMatchWithRepeat candidate
-- = genericLength
-- . takeWhile (uncurry (==)) . zip (cycle candidate)
-- $ next
-- fasterEncode bs = encode' start start 0
-- where
-- encode' toPredict predictFrom maxDistance = (take diff previous)
-- start = BS.unpack bs take
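
To make the Repeat semantics concrete: distance counts back from the end of the output produced so far, and matchLength bytes are copied forward from that point. A standalone sketch of the decoding step, assuming matchLength <= distance (no overlapping copy):

import Data.List (foldl')
import Data.Word (Word8)

data Token = Literal Word8 | Repeat { distance :: Int, matchLength :: Int }

-- Replay tokens left to right; each Repeat copies from the already-decoded output.
decodeTokens :: [Token] -> [Word8]
decodeTokens = foldl' step []
  where
    step out (Literal x)  = out ++ [x]
    step out (Repeat d n) = out ++ (take n . drop (length out - d) $ out)

-- decodeTokens [Literal 1, Literal 2, Literal 3, Repeat 3 3] == [1,2,3,1,2,3]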

View file

@ -17,9 +17,9 @@ import Data.PQueue qualified as PQ
import Debug.Trace qualified as D import Debug.Trace qualified as D
import Debug.Trace qualified as T import Debug.Trace qualified as T
import GHC.Generics import GHC.Generics
import qualified Data.Serialize as C
import Data.FiniteBit import Data.FiniteBit
import qualified Data.FiniteBit as B import qualified Data.FiniteBit as B
import qualified Codec.Winery as C
data Tree a = (Ord a) => data Tree a = (Ord a) =>
Tree Tree
@ -28,7 +28,8 @@ data Tree a = (Ord a) =>
newtype HuffmanPrefixTree a b = HuffmanPrefixTree newtype HuffmanPrefixTree a b = HuffmanPrefixTree
{ inner :: M.Map a (HuffmanTree b) { inner :: M.Map a (HuffmanTree b)
} deriving (Eq, Ord, Show, Generic, C.Serialize) } deriving (Eq, Ord, Show, Generic)
deriving C.Serialise via C.WineryRecord (HuffmanPrefixTree a b)
finiteBitTupleUncons :: finiteBitTupleUncons ::
forall a b. forall a b.
@ -85,7 +86,7 @@ compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> in
tree = toHuffmanTree . nGramCounts $ bs tree = toHuffmanTree . nGramCounts $ bs
treeDirMap :: M.Map a (M.Map b [TreeDir]) treeDirMap :: M.Map a (M.Map b [TreeDir])
treeDirMap = M.map HT.findTreeDirections . Compress.PrefixTree.inner $ tree treeDirMap = M.map HT.findTreeDirections tree.inner
initial :: Maybe a initial :: Maybe a
initial = fst <$> finiteBitUncons bs initial = fst <$> finiteBitUncons bs
@ -96,37 +97,6 @@ compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> in
treeDirsFor :: (a, b) -> [TreeDir] treeDirsFor :: (a, b) -> [TreeDir]
treeDirsFor (a, b) = (treeDirMap M.! a) M.! b treeDirsFor (a, b) = (treeDirMap M.! a) M.! b
-- | all (M.null . children) . M.elems . children $ tree =
-- fmap End
-- . HT.fromList
-- . map (\x -> (prefixCounts x, x))
-- . M.keys
-- . children
-- $ tree
-- | otherwise =
-- Just
-- . Layer
-- . M.mapMaybeWithKey (\key val -> toHuffmanTree' (key : soFar) val)
-- . children
-- $ tree
-- where
-- prefixCounts :: a -> Int
-- prefixCounts x =
-- fromIntegral
-- . sum
-- . M.elems
-- . M.filterWithKey (\key val -> L.isPrefixOf (reverse . (x :) $ soFar) key)
-- $ nGrams
-- toHuffmanTree :: Tree a -> p1 -> HuffmanTree a
-- toHuffmanTree :: forall a . Tree a -> M.Map [a] Word -> HuffmanTree [a]
-- toHuffmanTree (Tree {..}) nGrams soFar | M.size children == 1 = Leaf . map (reverse . (: soFar)) . M.keys $ children
-- toHuffmanTree (Tree {..}) nGrams soFar = Leaf . map (reverse . (: soFar)) . M.keys $ children
-- where
-- sorted = L.sortBy (prefixCounts . fst) . M.toList $ children
nGramCounts :: nGramCounts ::
forall a b. forall a b.
(Integral a, B.FiniteBits a, B.Bits a, Integral b, B.FiniteBits b, B.Bits b) => (Integral a, B.FiniteBits a, B.Bits a, Integral b, B.FiniteBits b, B.Bits b) =>
@ -155,34 +125,5 @@ fromSingleList (x : xs) = Tree . M.singleton x . fromSingleList $ xs
fromList :: (Ord a) => [[a]] -> Tree a fromList :: (Ord a) => [[a]] -> Tree a
fromList = F.foldl' merge Compress.PrefixTree.empty . map fromSingleList fromList = F.foldl' merge Compress.PrefixTree.empty . map fromSingleList
-- insert :: Ord a => Tree a -> [a] -> Tree a
-- insert (Tree {..}) (x:xs) =
merge :: Tree a -> Tree a -> Tree a merge :: Tree a -> Tree a -> Tree a
merge (Tree children0) (Tree children1) = Tree $ M.unionWith merge children0 children1 merge (Tree children0) (Tree children1) = Tree $ M.unionWith merge children0 children1
-- deriving instance Eq (Tree a)
-- deriving instance Ord (Tree a)
-- deriving instance (Show a) => Show (Tree a)
-- empty :: (Ord a) => Tree a
-- empty = Tree M.empty
-- fromList :: (Ord a, F.Foldable t) => t [a] -> Tree a
-- fromList = F.foldl' insert empty
-- insert :: Tree a -> [a] -> Tree a
-- insert (Tree {..}) [] = Tree M.empty
-- insert (Tree {..}) (x : xs) =
-- Tree
-- . flip (M.insert x) children
-- . flip insert xs
-- . My.fromMaybe empty
-- . M.lookup x
-- $ children
-- lookup :: (Ord a) => [a] -> Tree a -> Bool
-- lookup [] = const True
-- lookup (x : xs) = maybe False (Compress.PrefixTree.lookup xs) . M.lookup x . children

View file

@ -0,0 +1,8 @@
module Compress.WordMarkovStats where
data WordMarkovStats = WordMarkovStats
{ location :: Ratio Integer,
size :: Ratio Integer
}
deriving (Show, Eq, Ord, Generic)
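
WordMarkovStats is a sub-interval of [0, 1): size is a symbol's relative frequency and location is the running sum of the sizes before it, which is what toRing builds in Compress.Arithmetic. A standalone sketch of that layout (ignoring the zero-count bump toRing also applies):

import Data.List (scanl')
import Data.Ratio ((%))

-- Assign each symbol a (location, size) sub-interval of [0,1) from raw counts.
toIntervals :: [(k, Integer)] -> [(k, (Rational, Rational))]
toIntervals xs = zip (map fst xs) (zip (scanl' (+) 0 sizes) sizes)
  where
    total = sum (map snd xs)
    sizes = map ((% total) . snd) xs

-- toIntervals [('a',3),('b',1)] == [('a',(0 % 1,3 % 4)),('b',(3 % 4,1 % 4))]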

View file

@ -0,0 +1,62 @@
module Data.ArbitraryPrecisionFloatingPoint where
import Compress.Arithmetic (binarySearch)
import Data.Function.Pointless
data FloatingPoint = FloatingPoint
{ exponent :: Integer,
significand :: Integer
}
deriving (Eq)
instance Num FloatingPoint where
(+) :: FloatingPoint -> FloatingPoint -> FloatingPoint
(+) = onEqualExponent (+)
(*) :: FloatingPoint -> FloatingPoint -> FloatingPoint
(*) n0 n1 = correctExponent $ FloatingPoint (n0.exponent + n1.exponent) (n0.significand * n1.significand)
abs :: FloatingPoint -> FloatingPoint
abs (FloatingPoint {..}) = FloatingPoint exponent (abs significand)
negate :: FloatingPoint -> FloatingPoint
negate (FloatingPoint {..}) = FloatingPoint exponent (negate significand)
fromInteger :: Integer -> FloatingPoint
fromInteger = correctExponent . FloatingPoint 0
signum :: FloatingPoint -> FloatingPoint
signum = FloatingPoint 0 . signum . (.significand)
onEqualExponent :: (Integer -> Integer -> Integer) -> FloatingPoint -> FloatingPoint -> FloatingPoint
onEqualExponent f n0 n1 = correctExponent . FloatingPoint n0'.exponent $ on f (.significand) n0' n1'
where
(n0', n1') = equalizeExponents n0 n1
-- | significand shouldn't have trailing 0s in binary
-- | exponent should be one larger instead
-- | this function corrects until the significand isn't divisible by 2
correctExponent :: FloatingPoint -> FloatingPoint
correctExponent (FloatingPoint {..})
| significand /= 0 && remainder == 0 = correctExponent $ FloatingPoint (succ exponent) quotient
| otherwise = (FloatingPoint {..})
where
(quotient, remainder) = divMod significand 2
equalizeExponents :: FloatingPoint -> FloatingPoint -> (FloatingPoint, FloatingPoint)
equalizeExponents n0 n1 =
( withDiff n0 exponent0Diff,
withDiff n1 exponent1Diff
)
where
withDiff n exponentDiff = FloatingPoint (n.exponent + exponentDiff) (n.significand * (2 ^ exponentDiff))
signedExponentDiff = n0.exponent - n1.exponent
(exponent0Diff, exponent1Diff)
| signedExponentDiff < 0 = (0, abs signedExponentDiff)
| otherwise = (signedExponentDiff, 0)
instance Ord FloatingPoint where
compare :: FloatingPoint -> FloatingPoint -> Ordering
compare = uncurry (on compare (.significand)) .: equalizeExponents

52
src/Data/Dirs.hs Normal file
View file

@ -0,0 +1,52 @@
-- |
module Data.Dirs where
import qualified Data.ByteString as BS
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector as V
import qualified Data.Bit as VB
import qualified Data.Bit as BV
import Data.Ratio
import Control.Monad.Par qualified as P
import Codec.Winery qualified as C
import qualified Relude as R
data Dir = Higher | Lower deriving Show
data SerializedDirs = SerializedDirs {
bs :: BS.ByteString,
length :: Int
} deriving (Generic, Show, Ord, Eq, P.NFData, C.Serialise)
-- toBS :: [Dir] -> SerializedDirs
serialize dirs = SerializedDirs {..}
where
bs = VB.cloneToByteString . VU.fromList . map dirToBit $ dirs
length = R.length dirs
deserialize :: SerializedDirs -> Rational
deserialize (SerializedDirs {..}) = fromDirs . map bitToDir . VU.toList $ asVec
where
asVec = VU.take length . VB.cloneFromByteString $ bs
addNewBit x Higher = (x * 2) + 1
addNewBit x Lower = x * 2
fromDirs = sum . (0.5 :) . zipWith (*) fractions . map dirToNum
where
addNewBit x Higher = (x * 2) + 1
addNewBit x Lower = x * 2
dirToNum Higher = 1
dirToNum Lower = -1
bitToDir (BV.Bit True ) = Higher
bitToDir (BV.Bit False) = Lower
dirToBit Higher = BV.Bit True
dirToBit Lower = BV.Bit False
fractions :: [Rational]
fractions = iterate ( / 2) (1 % 4)
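
serialize packs one bit per direction with bitvec and keeps the element count, because cloneToByteString pads up to whole bytes. A round-trip sketch of that packing (standalone, using the bitvec package's Data.Bit):

import qualified Data.Bit as B
import qualified Data.ByteString as BS
import qualified Data.Vector.Unboxed as VU

data Dir = Higher | Lower deriving (Eq, Show)

-- One bit per direction, plus the true length (the ByteString is padded to bytes).
pack :: [Dir] -> (BS.ByteString, Int)
pack dirs = (B.cloneToByteString (VU.fromList (map (B.Bit . (== Higher)) dirs)), length dirs)

unpack :: (BS.ByteString, Int) -> [Dir]
unpack (bs, n) =
  map (\b -> if B.unBit b then Higher else Lower)
    . VU.toList . VU.take n . B.cloneFromByteString $ bs

-- unpack (pack [Higher, Lower, Lower, Higher]) == [Higher, Lower, Lower, Higher]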

View file

@ -9,6 +9,9 @@ import Data.Vector.Unboxed qualified as V
import Data.Bit qualified as B import Data.Bit qualified as B
import Data.Bit qualified as BV import Data.Bit qualified as BV
import Data.ByteString qualified as BS import Data.ByteString qualified as BS
import qualified Codec.Winery as C
import Data.FiniteBit (toByteString)
import Codec.Winery (serialise)
data HuffmanTree a data HuffmanTree a
= Leaf a = Leaf a
@ -16,24 +19,29 @@ data HuffmanTree a
{ left :: HuffmanTree a, { left :: HuffmanTree a,
right :: HuffmanTree a right :: HuffmanTree a
} }
deriving (Eq, Ord, Show, Generic, C.Serialize, Functor) deriving (Eq, Ord, Show, Generic, Functor)
deriving C.Serialise via (C.WineryVariant (HuffmanTree a))
-- here so we can define our own Serialize instance -- here so we can define our own Serialize instance
newtype TreeDirs = TreeDirs { newtype TreeDirs = TreeDirs {
inner :: [TreeDir] inner :: [TreeDir]
} deriving (Eq, Ord, Show) } deriving (Eq, Ord, Show)
-- deriving C.Serialize via C.WineryVariant TreeDirs
data TreeDir = L | R deriving (Eq, Ord, Show) data TreeDir = L | R deriving (Eq, Ord, Show)
instance C.Serialize TreeDirs where newtype SerializedTreeDirs = SerializedTreeDirs
put :: C.Putter TreeDirs {
put = C.put . Bi.first BV.cloneToByteString . (\x -> (x, V.length x)) . V.fromList . map (BV.Bit . (== R)) . (inner :: TreeDirs -> [TreeDir]) inner :: BS.ByteString
} deriving (Generic, Eq, Ord)
deriving C.Serialise via (C.WineryRecord SerializedTreeDirs)
get :: C.Get TreeDirs serializeTreeDirs (TreeDirs inner) = SerializedTreeDirs . fst . Bi.first BV.cloneToByteString . (\x -> (x, V.length x)) . V.fromList . map (BV.Bit . (== R)) $ inner
get = do
(bs, len) <- C.get unserializeTreeDirs (SerializedTreeDirs bs) = TreeDirs . map (\x -> if BV.unBit x then R else L) . V.toList . V.take len . BV.cloneFromByteString $ bs
pure . TreeDirs . map (\x -> if BV.unBit x then R else L) . V.toList . V.take len . BV.cloneFromByteString $ bs where
len = BS.length bs
lookup :: lookup ::
forall a. forall a.

View file

@ -32,10 +32,10 @@ maxView :: PQueue a -> Maybe (Int, (a, PQueue a))
maxView = abstractView IM.lookupMax maxView = abstractView IM.lookupMax
insert :: Int -> p -> PQueue p -> PQueue p insert :: Int -> p -> PQueue p -> PQueue p
insert key a = PQueue . IM.insertWith NE.append key (NE.singleton a) . toMap insert key a = PQueue . IM.insertWith NE.append key (NE.singleton a) . (.toMap)
elems :: PQueue a -> [a] elems :: PQueue a -> [a]
elems = concatMap NE.toList . IM.elems . toMap elems = concatMap NE.toList . IM.elems . (.toMap)
keys :: PQueue a -> [IM.Key] keys :: PQueue a -> [IM.Key]
keys = IM.keys . toMap keys = IM.keys . (.toMap)

View file

@ -1,46 +1,62 @@
-- | note to myself. Something's wrong here.
-- | it doesn't work properly when fed into Compress.Arithmetic
module Data.Word4 where module Data.Word4 where
import Data.Bits import Data.Bits
import Data.Word import Data.Word
import qualified Data.Serialize as C import qualified Codec.Winery as C
import qualified Data.Map as M import qualified Data.Map as M
import qualified Text.Show as S
import qualified Relude.Unsafe as U
import Codec.Winery (WineryRecord)
-- only use lower 4 bits -- only use lower 4 bits
newtype Word4 = Word4 { newtype Word4 = Word4 {
inner :: Word8 inner :: Word8
} deriving (Eq, Ord, Show, Enum, Generic, C.Serialize) } deriving (Eq, Ord, Enum, Generic)
deriving C.Serialise via C.WineryRecord Word4
instance S.Show Word4 where
show = show . (.inner)
map2 :: (Word8 -> Word8 -> Word8) -> Word4 -> Word4 -> Word4 map2 :: (Word8 -> Word8 -> Word8) -> Word4 -> Word4 -> Word4
map2 f (Word4 a) (Word4 b) = Word4 $ f a b map2 f (Word4 a) (Word4 b) = Word4 $ f a b
data CompressWord4List = CompressWord4List newtype CompressWord4List = CompressWord4List
{ xs :: [Word8], { xs :: [Word8]
last :: Maybe Word8 -- in lower 4 bits } deriving (Eq, Ord, Show, Generic)
} deriving (Eq, Ord, Generic, Show, C.Serialize, NFData) deriving C.Serialise via C.WineryVariant CompressWord4List
bitwiseAnd a b = getAnd $ And a <> And b bitwiseAnd a b = getAnd $ And a <> And b
bitwiseOr a b = getIor $ Ior a <> Ior b bitwiseOr a b = getIor $ Ior a <> Ior b
toCompressedWord4List [] = CompressWord4List [] Nothing
toCompressedWord4List [Word4 x] = CompressWord4List [] (Just x)
toCompressedWord4List ((Word4 x) : (Word4 x') : xs) =
CompressWord4List
{ xs = headByte : xs',
last = last'
}
where
headByte = bitwiseOr (bitwiseAnd 0xf0 (x .<<. 4)) (bitwiseAnd 0x0f x')
(CompressWord4List xs' last') = toCompressedWord4List xs toCompressedWord4List = toCompressedWord4List' . U.init
where
toCompressedWord4List' [] = CompressWord4List []
toCompressedWord4List' [Word4 x] = CompressWord4List [x .<<. 4]
toCompressedWord4List' ((Word4 x) : (Word4 x') : xs) =
CompressWord4List
{ xs = headByte : xs'
}
where
headByte = bitwiseOr (bitwiseAnd 0xf0 (x .<<. 4)) (bitwiseAnd 0x0f x')
(CompressWord4List xs') = toCompressedWord4List' xs
fromCompressedWord4List :: CompressWord4List -> [Word4] fromCompressedWord4List :: CompressWord4List -> [Word4]
fromCompressedWord4List (CompressWord4List [] Nothing) = [] fromCompressedWord4List = fromCompressedWord4List' . (.xs)
fromCompressedWord4List (CompressWord4List [] (Just a)) = [word4 a] where
fromCompressedWord4List (CompressWord4List {xs = (x : xs), ..}) = fromCompressedWord4List' :: [Word8] -> [Word4]
Word4 ((bitwiseAnd 0xf0 x) .>>. 4) fromCompressedWord4List' [] = []
: word4 x fromCompressedWord4List' (x : xs) =
: fromCompressedWord4List (CompressWord4List xs last) word4 (x .>>. 4)
: word4 x
: fromCompressedWord4List' xs
-- instance Show Word4 where -- instance Show Word4 where
-- show :: Word4 -> String -- show :: Word4 -> String
@ -48,10 +64,10 @@ fromCompressedWord4List (CompressWord4List {xs = (x : xs), ..}) =
instance Bounded Word4 where instance Bounded Word4 where
minBound = word4 0 minBound = word4 0
maxBound = word4 0xf maxBound = word4 15
instance Real Word4 where instance Real Word4 where
toRational = toRational . inner toRational = toRational . (.inner)
instance Integral Word4 where instance Integral Word4 where
quot = map2 quot `on` clean quot = map2 quot `on` clean
@ -66,9 +82,9 @@ instance Integral Word4 where
divMod (Word4 a) (Word4 b) = (word4 a', word4 b') divMod (Word4 a) (Word4 b) = (word4 a', word4 b')
where where
(a', b') = divMod a b (a', b') = divMod a b
toInteger = toInteger . inner . clean toInteger = toInteger . (.inner) . clean
clean (Word4 a) = Word4 . bitwiseAnd 0xf $ a clean (Word4 a) = Word4 . bitwiseAnd 0x0f $ a
word4 = clean . Word4 word4 = clean . Word4
instance Num Word4 where instance Num Word4 where
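
The Word4 list compression is nibble packing: two 4-bit values share one byte, high nibble first, and an odd trailing value keeps the high nibble with a zero pad, so the round trip can come back one nibble longer. A standalone sketch:

import Data.Bits (shiftL, shiftR, (.&.), (.|.))
import Data.Word (Word8)

-- Pack values < 16 two-per-byte, high nibble first.
packNibbles :: [Word8] -> [Word8]
packNibbles (a : b : rest) = (((a .&. 0x0f) `shiftL` 4) .|. (b .&. 0x0f)) : packNibbles rest
packNibbles [a]            = [(a .&. 0x0f) `shiftL` 4]
packNibbles []             = []

unpackNibbles :: [Word8] -> [Word8]
unpackNibbles = concatMap (\w -> [w `shiftR` 4, w .&. 0x0f])

-- packNibbles [1, 2, 3, 4]              == [0x12, 0x34]
-- unpackNibbles (packNibbles [1, 2, 3]) == [1, 2, 3, 0]   (note the pad nibble)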

View file

@ -1,71 +0,0 @@
module Main where
import Compress.Huffman
import Compress.Huffman qualified as HT
import Compress.PrefixTree (HuffmanPrefixTree (HuffmanPrefixTree))
import Compress.PrefixTree qualified as PT
import Compress.Arithmetic qualified as A
import Data.Bifunctor qualified as Bi
import Data.Bit qualified as B
import Data.Bits (Bits ((.|.)))
import Data.Bits qualified as B
import Data.ByteString qualified as BS
import Data.HuffmanTree
import Data.Proxy qualified as P
import Data.Serialize qualified as C
import Data.Word
import GHC.Generics (Generic)
import Options.Generic qualified as O
import qualified Data.FiniteBit as FB
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
data CompressionStrategy = Huffman | MarkovHuffman deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
data CLIOpts = CLIOpts
{ task :: CompressOrDecompress,
strategy :: CompressionStrategy
}
deriving (Show, Generic, O.ParseRecord)
applyCompressionOptions ::
forall a.
(Integral a, B.Bits a, B.FiniteBits a, Ord a, C.Serialize a) =>
P.Proxy a ->
CLIOpts ->
BS.ByteString ->
BS.ByteString
applyCompressionOptions _ (CLIOpts Compress Huffman) f =
C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) $ f
applyCompressionOptions _ (CLIOpts Compress MarkovHuffman) f =
C.encode . (PT.compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a a, a)) $ f
applyCompressionOptions _ (CLIOpts Decompress Huffman) f =
handleError $ Bi.second decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanTree a)) $ f
applyCompressionOptions _ (CLIOpts Decompress MarkovHuffman) f =
handleError $ Bi.second PT.decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanPrefixTree a a, a)) $ f
handleError (Right (Just bs)) = bs
handleError _ = []
main :: IO ()
main = do
-- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
f <- BS.getContents
let fAsWords = (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
let compressedUnencoded :: (A.Compressed Word8 (A.Word4MapSerialized Word8)) = (A.compress fAsWords)
let compressed = C.encode compressedUnencoded
print ("compression ratio", (fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
print ("works?", (A.decompress compressedUnencoded) == fAsWords)
-- let f = "hello tehre"
-- f <- BS.readFile "pg64317.txt"
-- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f
-- print $ BS.length . C.encode $ compressed
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word8)) $ f
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word16)) $ f
-- BS.writeFile "outin.txt" decompressed
-- print (decompressed, f)
-- print $ BS.length decompressed
-- print $ BS.length f
-- print (decompressed == f)
-- BS.putStr . applyCompressionOptions (P.Proxy :: P.Proxy Word16) cliOpts $ f

92
test/Test.hs Normal file
View file

@ -0,0 +1,92 @@
import Compress.Arithmetic (twoByteMarkov)
import Compress.Arithmetic qualified as A
import Compress.LengthDistancePairs as LDP
import Compress.WordMarkovStats as WM
import Data.Bifunctor qualified as Bi
import Data.ByteString qualified as BS
import Data.Dirs as D
import Data.Map.Strict qualified as M
import Data.Ratio
import Test.Falsify.Generator qualified as Gen
import Test.Falsify.Predicate as P
import Test.Falsify.Property qualified as P
import Test.Falsify.Range qualified as Range
import Test.Tasty
import Test.Tasty.Falsify as F
import Test.Tasty.HUnit
-- import qualified GHC.Enum as B
main :: IO ()
main = do
toCompress <- -- BS.take 12000 <$>
BS.readFile "pg64317.txt"
defaultMain (tests toCompress)
tests toCompress =
testGroup
"falsify"
[ testGroup
"Arithmetic"
[ testCase "works" $
A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8])
@?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])],
testCase "relativeCounts works as expected with one param" $
A.relativeCounts [(0, 30)]
@?= [(0, 1)],
testCase "relativeCounts works as expected with lots of params" $
A.relativeCounts [(0, 30), (1, 20)]
@?= [(0, 30 % 50), (1, 20 % 50)],
testCase "toRing locs all less than 1" $
assertBool "larger than one" $
all (all ((<= 1) . (.location))) toCompressRing,
testCase "toRing sizes all add up to 1" $
assertBool "larger than 1" $
all ((== 1) . sum . map (.size)) toCompressRing,
testCase "toRing gives no zero sizes" $
assertBool "== 0" $
all (all ((/= 0) . (.size))) toCompressRing,
F.testProperty "binary search" propBinarySearchWithinBounds,
F.testProperty "compress and decompress isomorphism" (propCompressDecompressIsomorphism (A.decompress . A.compress))
]
-- testGroup
-- "LengthDistancePair"
-- [ F.testProperty "compress and decompress isomorphism" $ (propCompressDecompressIsomorphism (LDP.decode . (LDP.encode :: [Word8] -> [LDP.LengthDistancePair Word32])))
-- ]
]
where
toCompressRing =
map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8)))
. M.elems
. A.twoByteMarkov
. BS.unpack
$ toCompress
wordMaxBound :: Integer
wordMaxBound = fromIntegral (maxBound :: Word)
genProperFraction = F.gen . fmap ((% wordMaxBound) . fromIntegral) . Gen.inRange $ (Range.between (0, maxBound :: Word))
-- propBinarySearchWithinBounds :: Property' String ()
propBinarySearchWithinBounds = do
bound1 <- genProperFraction
epsilon' <- genProperFraction
let epsilon = max (1 % (fromIntegral (maxBound :: Word))) . (* 0.1) $ bound1 * epsilon'
let bound2 = if bound1 + epsilon < 1 then bound1 + epsilon else bound1 - epsilon
-- let bound2 = min 1 $ bound1 + epsilon
let iso = D.deserialize . D.serialize $ A.binarySearch bound1 bound2
P.assert $ P.le .$ ("minimum", min bound1 bound2) .$ ("iso", iso)
P.assert $ P.ge .$ ("maximum", max bound1 bound2) .$ ("iso", iso)
-- P.assert $ Range.between (min bound1 bound2) (max bound1 bound2) .$ (A.fromDirs $ A.binarySearch bound1 bound2)
-- propCompressDecompressIsomorphism :: F.Property ()
-- propCompressDecompressIsomorphism :: Property' String ()
propCompressDecompressIsomorphism :: ([Word8] -> [Word8]) -> Property' String ()
propCompressDecompressIsomorphism iso = do
(xs :: [Word8]) <-
F.gen $ Gen.list (Range.between (4, 5000)) intGen
P.assert $ eq .$ ("xs", xs) .$ ("changed", iso $ xs)
where
intGen :: F.Gen Word8
intGen = Gen.inRange $ Range.between (0, fromIntegral (maxBound :: Word8))