benchmark, test suite, and run length encoding
This commit is contained in:
parent
2123636291
commit
da83f9a5d0
19 changed files with 884 additions and 364 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -6,3 +6,4 @@
|
||||||
/cabal.project.local
|
/cabal.project.local
|
||||||
/compress.hp
|
/compress.hp
|
||||||
/*.prof
|
/*.prof
|
||||||
|
/calgarycorpus/
|
||||||
|
|
|
||||||
15
bench/Bench.hs
Normal file
15
bench/Bench.hs
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
import Criterion.Main
|
||||||
|
import Compress.LengthDistancePairs
|
||||||
|
import qualified Data.ByteString as BS
|
||||||
|
import qualified Compress.Arithmetic as A
|
||||||
|
|
||||||
|
main = do
|
||||||
|
testData <- BS.unpack . BS.take 10000 <$> (BS.readFile "pg64317.txt")
|
||||||
|
defaultMain [
|
||||||
|
bgroup "encode" [
|
||||||
|
bench "length distance pair" $ nf (encode :: [Word8] -> [LengthDistancePair Word32]) testData
|
||||||
|
],
|
||||||
|
bgroup "arithmetic coding" [
|
||||||
|
bench "pg64317.txt" $ nf A.compress testData
|
||||||
|
]
|
||||||
|
]
|
||||||
|
|
@ -6,23 +6,26 @@ build-type: Simple
|
||||||
license: MIT
|
license: MIT
|
||||||
license-file: LICENSE
|
license-file: LICENSE
|
||||||
|
|
||||||
executable compress
|
common deps
|
||||||
main-is:
|
|
||||||
Main.hs
|
|
||||||
default-extensions:
|
default-extensions:
|
||||||
DataKinds,
|
DataKinds,
|
||||||
DeriveAnyClass,
|
DeriveAnyClass,
|
||||||
DeriveGeneric,
|
DeriveGeneric,
|
||||||
|
DerivingStrategies,
|
||||||
|
DerivingVia,
|
||||||
DuplicateRecordFields,
|
DuplicateRecordFields,
|
||||||
ExtendedDefaultRules,
|
ExtendedDefaultRules,
|
||||||
FlexibleContexts,
|
FlexibleContexts,
|
||||||
FlexibleInstances,
|
FlexibleInstances,
|
||||||
ImpredicativeTypes,
|
ImpredicativeTypes,
|
||||||
InstanceSigs,
|
InstanceSigs,
|
||||||
|
LambdaCase,
|
||||||
MultiParamTypeClasses,
|
MultiParamTypeClasses,
|
||||||
NamedFieldPuns,
|
NamedFieldPuns,
|
||||||
|
NoFieldSelectors,
|
||||||
OverloadedLabels,
|
OverloadedLabels,
|
||||||
OverloadedLists,
|
OverloadedLists,
|
||||||
|
OverloadedRecordDot,
|
||||||
OverloadedStrings,
|
OverloadedStrings,
|
||||||
PartialTypeSignatures,
|
PartialTypeSignatures,
|
||||||
RankNTypes,
|
RankNTypes,
|
||||||
|
|
@ -32,15 +35,12 @@ executable compress
|
||||||
StandaloneDeriving,
|
StandaloneDeriving,
|
||||||
StrictData,
|
StrictData,
|
||||||
TemplateHaskell,
|
TemplateHaskell,
|
||||||
LambdaCase,
|
|
||||||
TupleSections,
|
TupleSections,
|
||||||
TypeApplications,
|
TypeApplications,
|
||||||
TypeFamilies,
|
TypeFamilies,
|
||||||
TypeOperators,
|
TypeOperators,
|
||||||
TypeSynonymInstances,
|
TypeSynonymInstances,
|
||||||
UndecidableInstances,
|
UndecidableInstances,
|
||||||
hs-source-dirs:
|
|
||||||
src
|
|
||||||
mixins:
|
mixins:
|
||||||
base hiding (Prelude),
|
base hiding (Prelude),
|
||||||
relude (Relude as Prelude),
|
relude (Relude as Prelude),
|
||||||
|
|
@ -48,17 +48,21 @@ executable compress
|
||||||
build-depends:
|
build-depends:
|
||||||
base,
|
base,
|
||||||
bitvec,
|
bitvec,
|
||||||
bytestring,
|
|
||||||
cereal,
|
cereal,
|
||||||
|
bytestring,
|
||||||
containers,
|
containers,
|
||||||
|
winery,
|
||||||
|
parsec,
|
||||||
-- accelerate,
|
-- accelerate,
|
||||||
-- containers-accelerate,
|
-- containers-accelerate,
|
||||||
|
parsec,
|
||||||
leancheck,
|
leancheck,
|
||||||
monad-par,
|
monad-par,
|
||||||
monad-par-extras,
|
monad-par-extras,
|
||||||
nonempty-containers,
|
nonempty-containers,
|
||||||
optparse-generic,
|
optparse-generic,
|
||||||
relude,
|
relude,
|
||||||
|
pointless-fun,
|
||||||
text,
|
text,
|
||||||
uuid,
|
uuid,
|
||||||
vector,
|
vector,
|
||||||
|
|
@ -66,15 +70,63 @@ executable compress
|
||||||
default-language:
|
default-language:
|
||||||
GHC2021
|
GHC2021
|
||||||
other-modules:
|
other-modules:
|
||||||
Data.PQueue
|
Compress.Arithmetic
|
||||||
Data.FiniteBit
|
Compress.BurrowsWheeler
|
||||||
Compress.Huffman
|
Compress.Huffman
|
||||||
Compress.PrefixTree
|
Compress.PrefixTree
|
||||||
|
Compress.WordMarkovStats
|
||||||
|
Compress.LengthDistancePairs
|
||||||
|
Data.Dirs
|
||||||
|
Data.FiniteBit
|
||||||
Data.HuffmanTree
|
Data.HuffmanTree
|
||||||
Compress.Arithmetic
|
Data.PQueue
|
||||||
Data.Word4
|
Data.Word4
|
||||||
|
Data.ArbitraryPrecisionFloatingPoint
|
||||||
|
-- Data.CircularList
|
||||||
|
hs-source-dirs:
|
||||||
|
src
|
||||||
ghc-options:
|
ghc-options:
|
||||||
-threaded
|
-threaded
|
||||||
-fprof-auto
|
"-with-rtsopts=-N"
|
||||||
-fprof-late
|
|
||||||
"-with-rtsopts=-p -hc -B -N -qa"
|
|
||||||
|
benchmark bench
|
||||||
|
import: deps
|
||||||
|
type:
|
||||||
|
exitcode-stdio-1.0
|
||||||
|
main-is:
|
||||||
|
Bench.hs
|
||||||
|
build-depends:
|
||||||
|
criterion
|
||||||
|
|
||||||
|
hs-source-dirs:
|
||||||
|
bench
|
||||||
|
ghc-options:
|
||||||
|
"-with-rtsopts=-T -p -hc -B -qa"
|
||||||
|
|
||||||
|
test-suite test
|
||||||
|
import: deps
|
||||||
|
type:
|
||||||
|
exitcode-stdio-1.0
|
||||||
|
main-is:
|
||||||
|
Test.hs
|
||||||
|
build-depends:
|
||||||
|
tasty,
|
||||||
|
tasty-hunit,
|
||||||
|
falsify
|
||||||
|
|
||||||
|
hs-source-dirs:
|
||||||
|
test
|
||||||
|
|
||||||
|
|
||||||
|
executable compress
|
||||||
|
import: deps
|
||||||
|
|
||||||
|
main-is:
|
||||||
|
Main.hs
|
||||||
|
|
||||||
|
hs-source-dirs:
|
||||||
|
src-exe
|
||||||
|
|
||||||
|
|
||||||
|
-- -fllvm
|
||||||
|
|
|
||||||
131
flake.lock
generated
131
flake.lock
generated
|
|
@ -1,18 +1,34 @@
|
||||||
{
|
{
|
||||||
"nodes": {
|
"nodes": {
|
||||||
"accelerate": {
|
"barbies": {
|
||||||
"flake": false,
|
"flake": false,
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1732969010,
|
"lastModified": 1712605099,
|
||||||
"narHash": "sha256-Qrmtrgij2GbklBXUK42Pt6Db8WiGijA5sz5oC5AR72c=",
|
"narHash": "sha256-jDyIDPiGWAw4qLRoYA4p6njANOg4/EOCx0jmFl607IM=",
|
||||||
"owner": "AccelerateHS",
|
"owner": "jcpetruzza",
|
||||||
"repo": "accelerate",
|
"repo": "barbies",
|
||||||
"rev": "02da6161ef143a9886c8bce542cd96029c4f527a",
|
"rev": "856bc3d3cc72a13e95ed495afd15683c45c7cc55",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"owner": "AccelerateHS",
|
"owner": "jcpetruzza",
|
||||||
"repo": "accelerate",
|
"repo": "barbies",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"barbies-th": {
|
||||||
|
"flake": false,
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1730091166,
|
||||||
|
"narHash": "sha256-kc3ObxnhAJW6vgroFIax/qBOrp3HIWoLRtu0jsJUGIE=",
|
||||||
|
"owner": "fumieval",
|
||||||
|
"repo": "barbies-th",
|
||||||
|
"rev": "46c7b8c68634b219ff12e7966983f9b46a5976d4",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "fumieval",
|
||||||
|
"repo": "barbies-th",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
@ -21,11 +37,11 @@
|
||||||
"nixpkgs-lib": "nixpkgs-lib"
|
"nixpkgs-lib": "nixpkgs-lib"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1733312601,
|
"lastModified": 1756770412,
|
||||||
"narHash": "sha256-4pDvzqnegAfRkPwO3wmwBhVi/Sye1mzps0zHWYnP88c=",
|
"narHash": "sha256-+uWLQZccFHwqpGqr2Yt5VsW/PbeJVTn9Dk6SHWhNRPw=",
|
||||||
"owner": "hercules-ci",
|
"owner": "hercules-ci",
|
||||||
"repo": "flake-parts",
|
"repo": "flake-parts",
|
||||||
"rev": "205b12d8b7cd4802fbcb8e8ef6a0f1408781a4f9",
|
"rev": "4524271976b625a4a605beefd893f270620fd751",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -51,11 +67,11 @@
|
||||||
},
|
},
|
||||||
"haskell-flake": {
|
"haskell-flake": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1734984991,
|
"lastModified": 1756607542,
|
||||||
"narHash": "sha256-oUYtRBD3Yhw2jvKYo0lfd82fgEQQbFoiJcHO923gmOc=",
|
"narHash": "sha256-+99fEAk0HwjYgIW2tEOs7ayBDxnU9NAM5E29ZxgyX40=",
|
||||||
"owner": "srid",
|
"owner": "srid",
|
||||||
"repo": "haskell-flake",
|
"repo": "haskell-flake",
|
||||||
"rev": "daf00052906bdd977e57a07f7048437214232e87",
|
"rev": "73e3891fb135c679a1c30fae4b101e5b41b8ca61",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -64,42 +80,6 @@
|
||||||
"type": "github"
|
"type": "github"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"llvm-hs": {
|
|
||||||
"flake": false,
|
|
||||||
"locked": {
|
|
||||||
"lastModified": 1665495332,
|
|
||||||
"narHash": "sha256-JKrpUmHJ1nsNiCoHhV5FCcdQGlNFfD37Oiu5kSmghfM=",
|
|
||||||
"owner": "llvm-hs",
|
|
||||||
"repo": "llvm-hs",
|
|
||||||
"rev": "423220bffac4990d019fc088c46c5f25310d5a33",
|
|
||||||
"type": "github"
|
|
||||||
},
|
|
||||||
"original": {
|
|
||||||
"owner": "llvm-hs",
|
|
||||||
"ref": "llvm-12",
|
|
||||||
"repo": "llvm-hs",
|
|
||||||
"type": "github"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"llvm-hs-pure": {
|
|
||||||
"flake": false,
|
|
||||||
"locked": {
|
|
||||||
"dir": "llvm-hs-pure",
|
|
||||||
"lastModified": 1665495332,
|
|
||||||
"narHash": "sha256-JKrpUmHJ1nsNiCoHhV5FCcdQGlNFfD37Oiu5kSmghfM=",
|
|
||||||
"owner": "llvm-hs",
|
|
||||||
"repo": "llvm-hs",
|
|
||||||
"rev": "423220bffac4990d019fc088c46c5f25310d5a33",
|
|
||||||
"type": "github"
|
|
||||||
},
|
|
||||||
"original": {
|
|
||||||
"dir": "llvm-hs-pure",
|
|
||||||
"owner": "llvm-hs",
|
|
||||||
"ref": "llvm-12",
|
|
||||||
"repo": "llvm-hs",
|
|
||||||
"type": "github"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"mission-control": {
|
"mission-control": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1733438716,
|
"lastModified": 1733438716,
|
||||||
|
|
@ -117,11 +97,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1734649271,
|
"lastModified": 1756542300,
|
||||||
"narHash": "sha256-4EVBRhOjMDuGtMaofAIqzJbg4Ql7Ai0PSeuVZTHjyKQ=",
|
"narHash": "sha256-tlOn88coG5fzdyqz6R93SQL5Gpq+m/DsWpekNFhqPQk=",
|
||||||
"owner": "nixos",
|
"owner": "nixos",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "d70bd19e0a38ad4790d3913bf08fcbfc9eeca507",
|
"rev": "d7600c775f877cd87b4f5a831c28aa94137377aa",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -133,28 +113,31 @@
|
||||||
},
|
},
|
||||||
"nixpkgs-lib": {
|
"nixpkgs-lib": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1733096140,
|
"lastModified": 1754788789,
|
||||||
"narHash": "sha256-1qRH7uAUsyQI7R1Uwl4T+XvdNv778H0Nb5njNrqvylY=",
|
"narHash": "sha256-x2rJ+Ovzq0sCMpgfgGaaqgBSwY+LST+WbZ6TytnT9Rk=",
|
||||||
"type": "tarball",
|
"owner": "nix-community",
|
||||||
"url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz"
|
"repo": "nixpkgs.lib",
|
||||||
|
"rev": "a73b9c743612e4244d865a2fdee11865283c04e6",
|
||||||
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"type": "tarball",
|
"owner": "nix-community",
|
||||||
"url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz"
|
"repo": "nixpkgs.lib",
|
||||||
|
"type": "github"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"root": {
|
"root": {
|
||||||
"inputs": {
|
"inputs": {
|
||||||
"accelerate": "accelerate",
|
"barbies": "barbies",
|
||||||
|
"barbies-th": "barbies-th",
|
||||||
"flake-parts": "flake-parts",
|
"flake-parts": "flake-parts",
|
||||||
"flake-root": "flake-root",
|
"flake-root": "flake-root",
|
||||||
"haskell-flake": "haskell-flake",
|
"haskell-flake": "haskell-flake",
|
||||||
"llvm-hs": "llvm-hs",
|
|
||||||
"llvm-hs-pure": "llvm-hs-pure",
|
|
||||||
"mission-control": "mission-control",
|
"mission-control": "mission-control",
|
||||||
"nixpkgs": "nixpkgs",
|
"nixpkgs": "nixpkgs",
|
||||||
"systems": "systems",
|
"systems": "systems",
|
||||||
"treefmt-nix": "treefmt-nix"
|
"treefmt-nix": "treefmt-nix",
|
||||||
|
"winery": "winery"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"systems": {
|
"systems": {
|
||||||
|
|
@ -179,11 +162,11 @@
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1734982074,
|
"lastModified": 1756662192,
|
||||||
"narHash": "sha256-N7M37KP7cHWoXicuE536GrVvU8nMDT/gpI1kja2hkdg=",
|
"narHash": "sha256-F1oFfV51AE259I85av+MAia221XwMHCOtZCMcZLK2Jk=",
|
||||||
"owner": "numtide",
|
"owner": "numtide",
|
||||||
"repo": "treefmt-nix",
|
"repo": "treefmt-nix",
|
||||||
"rev": "e41e948cf097cbf96ba4dff47a30ea6891af9f33",
|
"rev": "1aabc6c05ccbcbf4a635fb7a90400e44282f61c4",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
@ -191,6 +174,22 @@
|
||||||
"repo": "treefmt-nix",
|
"repo": "treefmt-nix",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"winery": {
|
||||||
|
"flake": false,
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1732329065,
|
||||||
|
"narHash": "sha256-CDFOD4B6cJgAe3ebTVF21/eCJtbHMEzmKF3b/XxMOw4=",
|
||||||
|
"owner": "fumieval",
|
||||||
|
"repo": "winery",
|
||||||
|
"rev": "2d6976bc822f6d2c2d590703b9470cb9e62191a5",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "fumieval",
|
||||||
|
"repo": "winery",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"root": "root",
|
"root": "root",
|
||||||
|
|
|
||||||
75
flake.nix
75
flake.nix
|
|
@ -8,19 +8,24 @@
|
||||||
flake-root.url = "github:srid/flake-root";
|
flake-root.url = "github:srid/flake-root";
|
||||||
treefmt-nix.url = "github:numtide/treefmt-nix";
|
treefmt-nix.url = "github:numtide/treefmt-nix";
|
||||||
treefmt-nix.inputs.nixpkgs.follows = "nixpkgs";
|
treefmt-nix.inputs.nixpkgs.follows = "nixpkgs";
|
||||||
accelerate = {
|
|
||||||
url = "github:AccelerateHS/accelerate?submodules=1";
|
|
||||||
flake = false;
|
|
||||||
};
|
|
||||||
llvm-hs = {
|
|
||||||
url = "github:llvm-hs/llvm-hs?ref=llvm-12";
|
|
||||||
flake = false;
|
|
||||||
};
|
|
||||||
llvm-hs-pure = {
|
|
||||||
url = "github:llvm-hs/llvm-hs?dir=llvm-hs-pure&ref=llvm-12";
|
|
||||||
flake = false;
|
|
||||||
};
|
|
||||||
mission-control.url = "github:Platonic-Systems/mission-control";
|
mission-control.url = "github:Platonic-Systems/mission-control";
|
||||||
|
|
||||||
|
winery = {
|
||||||
|
url = "github:fumieval/winery";
|
||||||
|
flake = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
barbies-th = {
|
||||||
|
url = "github:fumieval/barbies-th";
|
||||||
|
flake = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
barbies = {
|
||||||
|
url = "github:jcpetruzza/barbies";
|
||||||
|
flake = false;
|
||||||
|
};
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
outputs = inputs:
|
outputs = inputs:
|
||||||
|
|
@ -38,7 +43,7 @@
|
||||||
# See https://github.com/srid/haskell-flake/blob/master/example/flake.nix
|
# See https://github.com/srid/haskell-flake/blob/master/example/flake.nix
|
||||||
haskellProjects.default = {
|
haskellProjects.default = {
|
||||||
# The base package set (this value is the default)
|
# The base package set (this value is the default)
|
||||||
basePackages = pkgs.haskell.packages.ghc96;
|
# basePackages = pkgs.haskell.packages.ghc910;
|
||||||
|
|
||||||
# Packages to add on top of `basePackages`
|
# Packages to add on top of `basePackages`
|
||||||
packages = {
|
packages = {
|
||||||
|
|
@ -46,30 +51,32 @@
|
||||||
# (Local packages are added automatically)
|
# (Local packages are added automatically)
|
||||||
# https://github.com/lehins/hip.git
|
# https://github.com/lehins/hip.git
|
||||||
# hip.source = inputs.hip + "/hip";
|
# hip.source = inputs.hip + "/hip";
|
||||||
accelerate.source = inputs.accelerate;
|
winery.source = inputs.winery;
|
||||||
# llvm-hs.source = inputs.llvm-hs;
|
barbies-th.source = inputs.barbies-th;
|
||||||
llvm-hs-pure.source = inputs.llvm-hs-pure;
|
barbies.source = inputs.barbies;
|
||||||
};
|
};
|
||||||
|
|
||||||
# Add your package overrides here
|
# Add your package overrides here
|
||||||
settings = {
|
settings = {
|
||||||
uuid.jailbreak = true;
|
uuid.jailbreak = true;
|
||||||
accelerate =
|
# accelerate =
|
||||||
{
|
# {
|
||||||
jailbreak = true;
|
# jailbreak = true;
|
||||||
broken = false;
|
# broken = false;
|
||||||
};
|
# };
|
||||||
llvm-hs =
|
# winery.broken = false;
|
||||||
{
|
winery.jailbreak = true;
|
||||||
jailbreak = true;
|
winery.check = false;
|
||||||
broken = false;
|
barbies-th.broken = false;
|
||||||
};
|
barbies-th.jailbreak = true;
|
||||||
llvm-hs-pure =
|
barbies.broken = false;
|
||||||
{
|
barbies.jailbreak = true;
|
||||||
jailbreak = true;
|
barbies.check = false;
|
||||||
broken = false;
|
tree-diff.check = false;
|
||||||
};
|
# hlint.jailbreak = true;
|
||||||
|
# ghc-lib-parser.jailbreak = true;
|
||||||
|
# ghc-lib-parser-ex.jailbreak = true;
|
||||||
|
# gmp-with-cxx.jailbreak = true;
|
||||||
};
|
};
|
||||||
# hlint = {
|
# hlint = {
|
||||||
# jailbreak = true;
|
# jailbreak = true;
|
||||||
|
|
@ -96,7 +103,7 @@
|
||||||
programs.ormolu.enable = true;
|
programs.ormolu.enable = true;
|
||||||
programs.nixpkgs-fmt.enable = true;
|
programs.nixpkgs-fmt.enable = true;
|
||||||
programs.cabal-fmt.enable = true;
|
programs.cabal-fmt.enable = true;
|
||||||
programs.hlint.enable = false;
|
# programs.hlint.enable = false;
|
||||||
|
|
||||||
# We use fourmolu
|
# We use fourmolu
|
||||||
programs.ormolu.package = pkgs.haskellPackages.fourmolu;
|
programs.ormolu.package = pkgs.haskellPackages.fourmolu;
|
||||||
|
|
@ -110,7 +117,6 @@
|
||||||
hoogle = {
|
hoogle = {
|
||||||
description = "Start Hoogle server for project dependencies";
|
description = "Start Hoogle server for project dependencies";
|
||||||
exec = ''
|
exec = ''
|
||||||
echo http://127.0.0.1:8888;
|
|
||||||
hoogle serve -p 8888 --local;
|
hoogle serve -p 8888 --local;
|
||||||
'';
|
'';
|
||||||
category = "Dev Tools";
|
category = "Dev Tools";
|
||||||
|
|
@ -139,6 +145,7 @@
|
||||||
config.treefmt.build.devShell
|
config.treefmt.build.devShell
|
||||||
];
|
];
|
||||||
nativeBuildInputs = with pkgs; [
|
nativeBuildInputs = with pkgs; [
|
||||||
|
# llvmPackages_16.libllvm
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
|
||||||
97
src-exe/Main.hs
Normal file
97
src-exe/Main.hs
Normal file
|
|
@ -0,0 +1,97 @@
|
||||||
|
module Main where
|
||||||
|
|
||||||
|
import Compress.Huffman
|
||||||
|
import Compress.Huffman qualified as HT
|
||||||
|
import Compress.PrefixTree qualified as PT
|
||||||
|
import Compress.Arithmetic qualified as A
|
||||||
|
import Data.Bifunctor qualified as Bi
|
||||||
|
import Data.Bit qualified as B
|
||||||
|
import Data.Bits (Bits ((.|.)))
|
||||||
|
import Data.Bits qualified as B
|
||||||
|
import Data.ByteString qualified as BS
|
||||||
|
import Data.HuffmanTree
|
||||||
|
import Data.Proxy qualified as P
|
||||||
|
import Codec.Winery qualified as C
|
||||||
|
import Data.Word
|
||||||
|
import GHC.Generics (Generic)
|
||||||
|
import Options.Generic qualified as O
|
||||||
|
import qualified Data.FiniteBit as FB
|
||||||
|
import qualified Relude.Unsafe as U
|
||||||
|
import qualified Data.ByteString.Lazy as BSL
|
||||||
|
-- import Data.Word4 (Word4(Word4))
|
||||||
|
import Compress.BurrowsWheeler
|
||||||
|
import qualified Control.Monad.Par as P
|
||||||
|
import qualified Compress.LengthDistancePairs as LDP
|
||||||
|
|
||||||
|
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
|
||||||
|
|
||||||
|
data CompressionStrategy = Huffman | MarkovHuffman deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
|
||||||
|
|
||||||
|
data CLIOpts = CLIOpts
|
||||||
|
{ task :: CompressOrDecompress,
|
||||||
|
strategy :: CompressionStrategy
|
||||||
|
}
|
||||||
|
deriving (Show, Generic, O.ParseRecord)
|
||||||
|
|
||||||
|
-- applyCompressionOptions ::
|
||||||
|
-- forall a.
|
||||||
|
-- (Integral a, B.Bits a, B.FiniteBits a, Ord a, C.Serialise a) =>
|
||||||
|
-- P.Proxy a ->
|
||||||
|
-- CLIOpts ->
|
||||||
|
-- BS.By(++ []) . teString ->
|
||||||
|
-- BS.ByteString
|
||||||
|
-- applyCompressionOptions _ (CLIOpts Compress Huffman) f =
|
||||||
|
-- C.serialise . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) $ f
|
||||||
|
-- applyCompressionOptions _ (CLIOpts Compress MarkovHuffman) f =
|
||||||
|
-- C.encode . (PT.compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a a, a)) $ f
|
||||||
|
-- applyCompressionOptions _ (CLIOpts Decompress Huffman) f =
|
||||||
|
-- handleError $ Bi.second decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanTree a)) $ f
|
||||||
|
-- applyCompressionOptions _ (CLIOpts Decompress MarkovHuffman) f =
|
||||||
|
-- handleError $ Bi.second PT.decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanPrefixTree a a, a)) $ f
|
||||||
|
|
||||||
|
handleError (Right (Just bs)) = bs
|
||||||
|
handleError _ = []
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
-- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
|
||||||
|
f <- BS.getContents
|
||||||
|
let f' = BS.unpack f
|
||||||
|
print . BS.pack . A.decompress . A.compress $ f'
|
||||||
|
-- print =<< (P.runParIO $ do
|
||||||
|
-- let fCounts = (toCounts :: [Word8] -> [(Word8, Word64)]) . {- toBurrowsWheeler . -} (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
|
||||||
|
-- let fAsWords = map fst fCounts
|
||||||
|
-- -- let compressedUnencoded {- :: (A.Compressed Word8 (A.Word4MapSerialized Word8)) -} = (A.compress fAsWords)
|
||||||
|
-- -- print . A.talkAboutCompressed $ compressedUnencoded
|
||||||
|
-- -- let compressed = C.serialise (compressedUnencoded, A.compress . map snd $ fCounts)
|
||||||
|
-- -- huffmanCompressionRatio' <- P.spawnP $ (compressionRatioFor (Proxy :: Proxy Word8) (P.Proxy :: Proxy Word8) f)
|
||||||
|
-- -- arithmaticCodingCompressionRatio' <- P.spawnP $ ((fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
|
||||||
|
-- -- huffmanCompressionRatio <- P.get huffmanCompressionRatio'
|
||||||
|
-- -- arithmaticCodingCompressionRatio <- P.get arithmaticCodingCompressionRatio'
|
||||||
|
-- -- let lengthDistancePairsCompressedSize = fromIntegral . BS.length . C.serialise . (LDP.encode :: ByteString -> [LDP.LengthDistancePair Word16]) $ f
|
||||||
|
-- -- let lengthDistancePairsCompressionRatio :: Double = lengthDistancePairsCompressedSize / (fromIntegral $ BS.length f)
|
||||||
|
-- pure ()) -- (lengthDistancePairsCompressionRatio))
|
||||||
|
-- let decompressed = (A.decompress compressedUnencoded)
|
||||||
|
-- print ("huffman coding", huffmanCompressionRatio)
|
||||||
|
-- print ("compression ratio (arithmetic coding)", arithmaticCodingCompressionRatio)
|
||||||
|
-- print ("works?", decompressed == fAsWords)
|
||||||
|
|
||||||
|
|
||||||
|
-- print . take 10 . drop 70 . zip fAsWords $ decompressed
|
||||||
|
|
||||||
|
-- print . ("original length", ) . length $ fAsWords
|
||||||
|
|
||||||
|
-- let f = "hello tehre"
|
||||||
|
-- f <- BS.readFile "pg64317.txt"
|
||||||
|
-- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f
|
||||||
|
-- print $ BS.length . C.encode $ compressed
|
||||||
|
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word8)) $ f
|
||||||
|
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word16)) $ f
|
||||||
|
-- BS.writeFile "outin.txt" decompressed
|
||||||
|
-- print (decompressed, f)
|
||||||
|
-- print $ BS.length decompressed
|
||||||
|
-- print $ BS.length f
|
||||||
|
-- print (decompressed == f)
|
||||||
|
-- BS.putStr . applyCompressionOptions (P.Proxy :: P.Proxy Word16) cliOpts $ f
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
-- | https://en.wikipedia.org/wiki/Arithmetic_coding
|
-- | https://en.wikipedia.org/wiki/Arithmetic_coding
|
||||||
module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized) where
|
-- module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized, Word4Map) where
|
||||||
|
module Compress.Arithmetic where
|
||||||
|
|
||||||
import Data.Bits qualified as B
|
import Data.Bits qualified as B
|
||||||
import Data.Word
|
import Data.Word
|
||||||
|
|
@ -16,12 +17,23 @@ import Data.Map.Strict qualified as M
|
||||||
import Data.Maybe qualified as My
|
import Data.Maybe qualified as My
|
||||||
import Data.Ord
|
import Data.Ord
|
||||||
import Data.Ratio
|
import Data.Ratio
|
||||||
import Data.Serialize qualified as C
|
import Codec.Winery qualified as C
|
||||||
import GHC.Generics (Generic)
|
import GHC.Generics (Generic)
|
||||||
import GHC.Natural (Natural)
|
import GHC.Natural (Natural)
|
||||||
import qualified Relude.Unsafe as US
|
import qualified Relude.Unsafe as US
|
||||||
import Data.Word4 (Word4, CompressWord4List, toCompressedWord4List, fromCompressedWord4List)
|
import Data.Word4 (Word4, CompressWord4List, toCompressedWord4List, fromCompressedWord4List)
|
||||||
import qualified Data.ByteString as BS
|
import qualified Data.ByteString as BS
|
||||||
|
import qualified Relude.Unsafe as U
|
||||||
|
import Codec.Winery (WineryRecord)
|
||||||
|
import Compress.BurrowsWheeler (toBurrowsWheeler)
|
||||||
|
import qualified Data.Bit as BV
|
||||||
|
import qualified Data.Vector as V
|
||||||
|
import qualified Data.Bit as VB
|
||||||
|
import qualified Data.Vector.Unboxed as VU
|
||||||
|
import Data.Dirs
|
||||||
|
-- import qualified Debug.Trace as D
|
||||||
|
import qualified Data.Proxy as P
|
||||||
|
import Compress.WordMarkovStats as WM
|
||||||
|
|
||||||
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
|
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
|
||||||
|
|
||||||
|
|
@ -29,41 +41,52 @@ import qualified Data.ByteString as BS
|
||||||
-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums
|
-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums
|
||||||
|
|
||||||
|
|
||||||
data WordMarkovStats = WordMarkovStats
|
toRing :: forall a k. (Show a, Show k, Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
|
||||||
{ location :: Ratio Integer,
|
|
||||||
size :: Ratio Integer
|
|
||||||
|
|
||||||
}
|
|
||||||
deriving (Show, Eq, Ord, Generic, C.Serialize)
|
|
||||||
|
|
||||||
|
|
||||||
loc' (WordMarkovStats {..}) = location
|
|
||||||
|
|
||||||
toRing :: forall a k. (Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
|
|
||||||
toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
|
toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
|
||||||
where
|
where
|
||||||
sizes = map ((% maxBound') . fromIntegral . snd) xs
|
|
||||||
|
|
||||||
wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
|
wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
|
||||||
|
|
||||||
asFracsOfTotalSum = map fst . US.tail . L.scanl' f (0, 0) $ sizes
|
|
||||||
|
|
||||||
maxBound' :: Integer
|
maxBound' :: Integer
|
||||||
maxBound' = fromIntegral (maxBound :: a)
|
maxBound' = fromIntegral (maxBound :: a)
|
||||||
|
|
||||||
|
resize xs = map ( / sum' ) xs
|
||||||
|
where
|
||||||
|
sum' = sum xs
|
||||||
|
|
||||||
|
withBumpedZeroSized :: [Rational]
|
||||||
withBumpedZeroSized
|
withBumpedZeroSized
|
||||||
| numZeroSized /= 0 = map (max (remainingSpace / numZeroSized)) asFracsOfTotalSum
|
= resize
|
||||||
| otherwise = asFracsOfTotalSum
|
. map (max minimumSize)
|
||||||
|
. resize
|
||||||
|
. map (fromIntegral . snd)
|
||||||
|
$ xs
|
||||||
|
where
|
||||||
|
-- | we have to round the 0s up to something
|
||||||
|
-- | we somewhat arbitrarily choose half of the smallest
|
||||||
|
-- | normally representable amount
|
||||||
|
minimumSize :: Rational
|
||||||
|
minimumSize = 1 % (maxBound' * 2)
|
||||||
|
|
||||||
remainingSpace = 1 - sum asFracsOfTotalSum
|
|
||||||
|
|
||||||
numZeroSized = L.genericLength . filter (== 0) $ asFracsOfTotalSum
|
|
||||||
|
|
||||||
f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
|
f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
|
||||||
where
|
where
|
||||||
newFrac = currFrac * (1 - runningSum)
|
newFrac = currFrac * (1 - runningSum)
|
||||||
|
|
||||||
twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k [(k, b)]
|
relativeCounts :: forall k . Ord k => [(k, Integer)] -> [(k, Rational)]
|
||||||
|
relativeCounts m = map (Bi.second asFractionOfSum) m
|
||||||
|
where
|
||||||
|
asFractionOfSum = ( % sum') . fromIntegral
|
||||||
|
|
||||||
|
sum' = fromIntegral . F.sum . map snd $ m
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
maxBound' :: forall a b . (Num b, Bounded a, Integral a) => P.Proxy a -> b
|
||||||
|
maxBound' p = fromIntegral (maxBound :: a)
|
||||||
|
|
||||||
|
twoByteMarkov :: forall k . (Ord k) => [k] -> M.Map k [(k, Rational)]
|
||||||
twoByteMarkov xs =
|
twoByteMarkov xs =
|
||||||
M.map sizeAsFraction
|
M.map sizeAsFraction
|
||||||
. M.fromListWith (M.unionWith (+))
|
. M.fromListWith (M.unionWith (+))
|
||||||
|
|
@ -72,77 +95,106 @@ twoByteMarkov xs =
|
||||||
. US.tail
|
. US.tail
|
||||||
$ xs
|
$ xs
|
||||||
where
|
where
|
||||||
-- optimization, changes fractions so they represent the fraction
|
|
||||||
-- of remaining space in the list taken up instead of total space
|
sizeAsFraction :: M.Map k Integer -> [(k, Rational)]
|
||||||
sizeAsFraction :: M.Map k Integer -> [(k, b)]
|
sizeAsFraction = relativeCounts . M.assocs
|
||||||
sizeAsFraction m =
|
|
||||||
zip keys
|
data Chunk a = Chunk
|
||||||
. map discretizeFraction
|
{ location :: SerializedDirs,
|
||||||
. fractionOfRemainingSums
|
start :: a,
|
||||||
$ counts
|
length :: Word64
|
||||||
|
}
|
||||||
|
deriving (Eq, Ord, Show, Generic, P.NFData)
|
||||||
|
deriving C.Serialise via C.WineryVariant (Chunk a)
|
||||||
|
|
||||||
|
-- maxBound = maxBound' (P.Proxy :: Proxy b)
|
||||||
|
|
||||||
|
discretizeFraction :: forall b . (Bounded b, Integral b) => Rational -> b
|
||||||
|
discretizeFraction = floor . min maxBound'' . fromRational . (* (succ $ maxBound' (P.Proxy :: P.Proxy b)))
|
||||||
where
|
where
|
||||||
fractionOfRemainingSums xs = zipWith (%) xs . scanr (+) 0 $ xs
|
maxBound'' = maxBound' (P.Proxy :: P.Proxy b)
|
||||||
|
|
||||||
asList = L.sortOn (negate . snd) . M.assocs $ m
|
|
||||||
|
|
||||||
keys = map fst asList
|
|
||||||
|
|
||||||
counts = map snd asList
|
|
||||||
|
|
||||||
discretizeFraction :: Rational -> b
|
|
||||||
discretizeFraction = floor . fromRational . (* maxBound')
|
|
||||||
|
|
||||||
maxBound' :: Ratio Integer
|
|
||||||
maxBound' = fromIntegral (maxBound :: b)
|
|
||||||
|
|
||||||
type Word4MapSerialized a = M.Map a ([a], CompressWord4List)
|
type Word4MapSerialized a = M.Map a ([a], CompressWord4List)
|
||||||
type Word4Map a = M.Map a [(a, Word4)]
|
type Word4Map a = M.Map a [(a, Word8)]
|
||||||
|
|
||||||
|
|
||||||
|
-- talkAboutCompressed (Compressed {..}) = (
|
||||||
|
-- ("markovs (serealized)", BS.length . C.serialiseOnly $ markovs),
|
||||||
|
-- ("markovs (unserealized)", BS.length . C.serialiseOnly . unserealizeWord4Map $ markovs),
|
||||||
|
-- ("chunks", BS.length . C.serialiseOnly $ chunks))
|
||||||
|
|
||||||
|
newtype Prediction = Prediction {
|
||||||
|
chances :: [(Word8, Rational)]
|
||||||
|
}
|
||||||
|
|
||||||
|
class CompressionModel a where
|
||||||
|
createModel :: [Word8] -> a
|
||||||
|
|
||||||
|
-- done in reverse
|
||||||
|
predict :: a -> [Word8] -> Prediction
|
||||||
|
|
||||||
|
-- at some point write
|
||||||
|
-- instance IsoSerialisable a b => C.Serialise b
|
||||||
|
class C.Serialise b => IsoSerialisable a b where
|
||||||
|
toSerialisable :: a -> b
|
||||||
|
|
||||||
|
fromSerialisable :: b -> a
|
||||||
|
|
||||||
|
-- newtype BetterCompressed = BetterCompressed (Compressed Word8 (A.Word4MapSerialized Word8))
|
||||||
|
-- deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
|
||||||
|
-- deriving C.Serialise via C.WineryRecord BetterCompressed
|
||||||
|
|
||||||
data Compressed a mapImplementation = Compressed
|
data Compressed a mapImplementation = Compressed
|
||||||
{ markovs :: mapImplementation,
|
{ markovs :: mapImplementation,
|
||||||
chunks :: [Chunk a]
|
chunks :: [Chunk a]
|
||||||
}
|
}
|
||||||
deriving (Eq, Ord, Show, Generic) -- , C.Serialize, P.NFData)
|
deriving (Eq, Ord, Show, Generic, NFData) -- , C.Serialise, P.NFData)
|
||||||
|
deriving C.Serialise via C.WineryVariant (Compressed a mapImplementation)
|
||||||
|
|
||||||
|
|
||||||
deriving instance (Ord a, C.Serialize a) => C.Serialize (Compressed a (Word4MapSerialized a))
|
|
||||||
|
|
||||||
data Chunk a = Chunk
|
-- unserealizeWord4Map :: forall a . Show a => Word4MapSerialized a -> Word4Map a
|
||||||
{ location :: Ratio Integer,
|
-- unserealizeWord4Map = M.map unserializedIndividualMap
|
||||||
start :: a,
|
-- where
|
||||||
length :: Int
|
-- unserializedIndividualMap :: ([a], CompressWord4List) -> [(a, Word4)]
|
||||||
}
|
-- unserializedIndividualMap (bytes, sizes) = zip bytes . fromCompressedWord4List $ sizes
|
||||||
deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
|
|
||||||
|
|
||||||
unserealizeWord4Map :: Word4MapSerialized a -> Word4Map a
|
-- -- unserealizeWord4Map = M.map (uncurry zip . map (\(bytes, sizes) -> (bytes,) . (++ [maxBound]) . fromCompressedWord4List (Prelude.length bytes) $ sizes))
|
||||||
unserealizeWord4Map = M.map (uncurry zip . Bi.second ((++ [maxBound]) . fromCompressedWord4List))
|
|
||||||
|
|
||||||
serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
|
|
||||||
serealizeWord4Map = M.map (Bi.second (toCompressedWord4List . L.init) . L.unzip)
|
-- serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
|
||||||
|
-- serealizeWord4Map = M.map (Bi.second toCompressedWord4List . L.unzip)
|
||||||
|
|
||||||
decompress ::
|
decompress ::
|
||||||
forall a.
|
forall a.
|
||||||
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
||||||
Compressed a (Word4MapSerialized a) ->
|
Compressed a (Word4Map a) ->
|
||||||
[a]
|
[a]
|
||||||
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ chunks
|
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk
|
||||||
|
$ chunks
|
||||||
where
|
where
|
||||||
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs'
|
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs'
|
||||||
|
|
||||||
markovs' = unserealizeWord4Map markovs
|
markovs' = -- unserealizeWord4Map
|
||||||
|
markovs
|
||||||
|
|
||||||
|
|
||||||
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
|
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
|
||||||
|
|
||||||
decompressChunk :: Chunk a -> [a]
|
decompressChunk :: Chunk a -> [a]
|
||||||
decompressChunk (Chunk {..}) = map snd . take length . L.iterate' decompress' $ (location, start)
|
decompressChunk (Chunk {..}) = map snd . take (fromIntegral length) . L.iterate' decompress' $ (location', start) -- (fromBS rawLocation, start)
|
||||||
where
|
where
|
||||||
|
|
||||||
|
location' = deserialize location
|
||||||
-- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
|
-- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
|
||||||
decompress' (!loc, !prev) = ((loc - ansLoc) / newSize, newVal)
|
decompress' (!loc, !prevWord) = -- traceShow (newSize) $
|
||||||
|
((loc - ansLoc) / newSize, newVal)
|
||||||
where
|
where
|
||||||
ansLoc :: Ratio Integer
|
ansLoc :: Ratio Integer
|
||||||
newVal :: a
|
newVal :: a
|
||||||
newSize :: Ratio Integer
|
newSize :: Ratio Integer
|
||||||
(!ansLoc, (!newVal, !newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)
|
(ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prevWord)
|
||||||
|
|
||||||
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
|
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
|
||||||
sanityCheck = map (sum . M.elems) . M.elems
|
sanityCheck = map (sum . M.elems) . M.elems
|
||||||
|
|
@ -156,18 +208,31 @@ chunk chunkSize = chunk'
|
||||||
(xs', xs'') = splitAt chunkSize xs
|
(xs', xs'') = splitAt chunkSize xs
|
||||||
|
|
||||||
chunkLength :: Int
|
chunkLength :: Int
|
||||||
chunkLength = 8000
|
chunkLength = 4096
|
||||||
|
|
||||||
|
|
||||||
|
-- runPar = id
|
||||||
|
-- parMap = map
|
||||||
|
|
||||||
-- compress ::
|
-- compress ::
|
||||||
-- forall a.
|
-- forall a.
|
||||||
-- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
-- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
||||||
-- [a] ->
|
-- [a] ->
|
||||||
-- Compressed a (Word4MapSerialized a)
|
-- Compressed a (Word4MapSerialized a)
|
||||||
compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))) $ Compressed (serealizeWord4Map twoByteMarkovs) . P.runPar . P.parMap compressChunk $ unCompressedChunks
|
compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a (Word4Map a)
|
||||||
|
compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap
|
||||||
|
(compressChunk)
|
||||||
|
$ unCompressedChunks
|
||||||
where
|
where
|
||||||
|
|
||||||
|
toCompress = toCompress' -- toBurrowsWheeler toCompress'
|
||||||
|
-- toTraceShow' = (unserealizeWord4Map . serealizeWord4Map $ twoByteMarkovs) == twoByteMarkovs
|
||||||
|
|
||||||
|
-- toTraceShow = (fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))
|
||||||
|
|
||||||
unCompressedChunks = chunk chunkLength toCompress
|
unCompressedChunks = chunk chunkLength toCompress
|
||||||
|
|
||||||
twoByteMarkovs = twoByteMarkov toCompress
|
twoByteMarkovs = M.map (map (Bi.second discretizeFraction)) . twoByteMarkov $ toCompress
|
||||||
|
|
||||||
rings = M.map toRing twoByteMarkovs
|
rings = M.map toRing twoByteMarkovs
|
||||||
|
|
||||||
|
|
@ -175,12 +240,23 @@ compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.
|
||||||
Chunk
|
Chunk
|
||||||
{ location = shortestLocation endStats,
|
{ location = shortestLocation endStats,
|
||||||
start = US.head toCompress,
|
start = US.head toCompress,
|
||||||
length = L.length toCompress
|
length = fromIntegral . L.length $ toCompress
|
||||||
}
|
}
|
||||||
where
|
where
|
||||||
pairs = zip toCompress . US.tail $ toCompress
|
pairs = zip toCompress . US.tail $ toCompress
|
||||||
|
|
||||||
shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size)
|
shortestLocation (WordMarkovStats {..}) = serialize $ binarySearch location (location + size)
|
||||||
|
-- shortestLocation (WordMarkovStats {..})
|
||||||
|
-- -- | simplestBetweenAnswer /= upperBound = simplestBetweenAnswer
|
||||||
|
-- | otherwise = simplestBetween location $ upperBound - epsilon
|
||||||
|
-- where
|
||||||
|
-- simplestBetweenAnswer = simplestBetween location upperBound
|
||||||
|
|
||||||
|
-- upperBound = location + size
|
||||||
|
|
||||||
|
-- -- | almost entirely arbitrary, picked because it's gaurenteed to be smaller than size
|
||||||
|
-- -- | and should make for a fairly quick subtraction
|
||||||
|
-- epsilon = 1 % (denominator size)
|
||||||
|
|
||||||
endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
|
endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
|
||||||
|
|
||||||
|
|
@ -194,6 +270,7 @@ compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.
|
||||||
size = prevSize * nextSize
|
size = prevSize * nextSize
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
pyramidFold :: (a -> a -> a) -> [a] -> a
|
pyramidFold :: (a -> a -> a) -> [a] -> a
|
||||||
pyramidFold f = pyramid
|
pyramidFold f = pyramid
|
||||||
where
|
where
|
||||||
|
|
@ -209,6 +286,7 @@ simplestBetween :: Rational -> Rational -> Rational
|
||||||
simplestBetween x y
|
simplestBetween x y
|
||||||
| x == y = x
|
| x == y = x
|
||||||
| x > 0 = simplestBetween' n d n' d'
|
| x > 0 = simplestBetween' n d n' d'
|
||||||
|
-- | x > 0 = simplestBetween' n d n' d'
|
||||||
| otherwise = 0 % 1
|
| otherwise = 0 % 1
|
||||||
where
|
where
|
||||||
n = numerator x
|
n = numerator x
|
||||||
|
|
@ -226,3 +304,15 @@ simplestBetween x y
|
||||||
nd'' = simplestBetween' d' r' d r
|
nd'' = simplestBetween' d' r' d r
|
||||||
n'' = numerator nd''
|
n'' = numerator nd''
|
||||||
d'' = denominator nd''
|
d'' = denominator nd''
|
||||||
|
|
||||||
|
binarySearch x y = binarySearch' 0 1
|
||||||
|
where
|
||||||
|
acceptLow = min x y
|
||||||
|
acceptHigh = max x y
|
||||||
|
|
||||||
|
binarySearch' searchSpaceLow searchSpaceHigh
|
||||||
|
| mid < acceptLow = Higher : binarySearch' mid searchSpaceHigh
|
||||||
|
| mid > acceptHigh = Lower : binarySearch' searchSpaceLow mid
|
||||||
|
| otherwise = []
|
||||||
|
where
|
||||||
|
mid = (searchSpaceLow + searchSpaceHigh) / 2
|
||||||
|
|
|
||||||
27
src/Compress/BurrowsWheeler.hs
Normal file
27
src/Compress/BurrowsWheeler.hs
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
-- |
|
||||||
|
|
||||||
|
module Compress.BurrowsWheeler where
|
||||||
|
-- import qualified Data.CircularList as CL
|
||||||
|
import qualified Relude.Unsafe as U
|
||||||
|
-- import Compress.BurrowsWheeler (toBurrowsWheeler)
|
||||||
|
|
||||||
|
|
||||||
|
toBurrowsWheeler (x:xs) = map U.head . sortOn (U.tail) . take (pred . length $ xs) . tails $ xs'
|
||||||
|
where
|
||||||
|
xs' = toList $ (x :| xs) <> (x :| [])
|
||||||
|
|
||||||
|
toCounts :: forall a b . (Eq a, Eq b, Num b, Enum b, Bounded a, Bounded b) => [a] -> [(a, b)]
|
||||||
|
toCounts = reverse . foldl' f []
|
||||||
|
where
|
||||||
|
f [] x = [(x, 0)]
|
||||||
|
f ((x, count) : xs) newVal
|
||||||
|
| x == newVal && count == maxBound = (x, 0) : (x, count) : xs
|
||||||
|
| x == newVal = (x, succ count) : xs
|
||||||
|
| otherwise = (newVal, 0) : (x, count) : xs
|
||||||
|
|
||||||
|
|
||||||
|
-- toBurrowsWheeler xs = map last . sort . map (findLyndonWord) . take (length xs) . tails $ xs'
|
||||||
|
-- where
|
||||||
|
-- xs' = xs ++ xs
|
||||||
|
|
||||||
|
-- findLyndonWord (x : xs) = (x :|) . map snd . takeWhile (uncurry (<=)) . zip (x : xs) $ xs
|
||||||
|
|
@ -20,7 +20,7 @@ import Data.Bits as B
|
||||||
import Data.Maybe (fromMaybe)
|
import Data.Maybe (fromMaybe)
|
||||||
import Data.Maybe qualified as My
|
import Data.Maybe qualified as My
|
||||||
import Data.PQueue qualified as PQ
|
import Data.PQueue qualified as PQ
|
||||||
import Data.Serialize qualified as C
|
import Codec.Winery qualified as C
|
||||||
import Data.Text qualified as T
|
import Data.Text qualified as T
|
||||||
import Data.Text.IO qualified as TIO
|
import Data.Text.IO qualified as TIO
|
||||||
import Data.Vector.Unboxed qualified as V
|
import Data.Vector.Unboxed qualified as V
|
||||||
|
|
@ -30,6 +30,8 @@ import GHC.Generics (Generic)
|
||||||
import Options.Generic qualified as O
|
import Options.Generic qualified as O
|
||||||
import System.Environment qualified as SE
|
import System.Environment qualified as SE
|
||||||
import Data.HuffmanTree
|
import Data.HuffmanTree
|
||||||
|
import Data.Word4 (CompressWord4List(xs))
|
||||||
|
import Compress.BurrowsWheeler (toBurrowsWheeler, toCounts)
|
||||||
|
|
||||||
decompress
|
decompress
|
||||||
:: forall a
|
:: forall a
|
||||||
|
|
@ -50,13 +52,27 @@ decompress (TreeDirs treeDirs, tree) = BS.concat . map toByteString <$> decompre
|
||||||
nextLeaf (L : xs) (Node {..}) = nextLeaf xs left
|
nextLeaf (L : xs) (Node {..}) = nextLeaf xs left
|
||||||
nextLeaf (R : xs) (Node {..}) = nextLeaf xs right
|
nextLeaf (R : xs) (Node {..}) = nextLeaf xs right
|
||||||
|
|
||||||
|
data BurrowsWheelerCompressed a b = BurrowsWheelerCompressed {
|
||||||
|
elements :: Compressed SerializedTreeDirs a,
|
||||||
|
counts :: Compressed SerializedTreeDirs b
|
||||||
|
}
|
||||||
|
deriving (Eq, Ord, Generic) -- , C.Serialise, P.NFData)
|
||||||
|
deriving C.Serialise via C.WineryRecord (BurrowsWheelerCompressed a b)
|
||||||
|
|
||||||
|
data Compressed treeDir a = Compressed {
|
||||||
|
treeDirs :: treeDir,
|
||||||
|
huffman :: HuffmanTree a
|
||||||
|
}
|
||||||
|
deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
|
||||||
|
deriving C.Serialise via C.WineryRecord (Compressed treeDir a)
|
||||||
|
|
||||||
compress
|
compress
|
||||||
:: forall a
|
:: forall a
|
||||||
. (Ord a, Integral a, B.FiniteBits a, B.Bits a)
|
. (Ord a, Integral a)
|
||||||
=> BS.ByteString
|
=> [a]
|
||||||
-> Maybe (TreeDirs, HuffmanTree a)
|
-> Maybe (Compressed SerializedTreeDirs a)
|
||||||
compress bs =
|
compress dividedByteString =
|
||||||
liftA2 (,) (TreeDirs <$> treeDirections) mergedHuffmanTrees
|
liftA2 Compressed (serializeTreeDirs . TreeDirs <$> treeDirections) mergedHuffmanTrees
|
||||||
where
|
where
|
||||||
treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString
|
treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString
|
||||||
|
|
||||||
|
|
@ -64,13 +80,19 @@ compress bs =
|
||||||
mergeHuffmanTrees
|
mergeHuffmanTrees
|
||||||
. PQ.fromList
|
. PQ.fromList
|
||||||
. map (uncurry (flip (,)) . Bi.first Leaf)
|
. map (uncurry (flip (,)) . Bi.first Leaf)
|
||||||
. counts
|
. countOccurances
|
||||||
$ dividedByteString
|
$ dividedByteString
|
||||||
|
|
||||||
treeDirMap :: M.Map a [TreeDir]
|
treeDirMap :: M.Map a [TreeDir]
|
||||||
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
|
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
|
||||||
|
|
||||||
dividedByteString = toWordsList bs
|
compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b)
|
||||||
|
compressWithBurrowsWheeler bs
|
||||||
|
= liftA2 BurrowsWheelerCompressed
|
||||||
|
(compress . map fst $ withCounts)
|
||||||
|
(compress . map snd $ withCounts)
|
||||||
|
where
|
||||||
|
withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs
|
||||||
|
|
||||||
-- testCompression
|
-- testCompression
|
||||||
-- :: forall a
|
-- :: forall a
|
||||||
|
|
@ -100,10 +122,13 @@ compress bs =
|
||||||
-- decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBitsOps a, C.Serialize a) => BS.ByteString -> Either String (Maybe (TreeDirs, HuffmanTree a))
|
-- decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBitsOps a, C.Serialize a) => BS.ByteString -> Either String (Maybe (TreeDirs, HuffmanTree a))
|
||||||
-- decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode
|
-- decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode
|
||||||
|
|
||||||
counts :: (Ord a) => [a] -> [(a, Int)]
|
|
||||||
counts = M.toList . F.foldl' combiningInsert M.empty
|
-- I've replaced this with a simpler version but it might be slower
|
||||||
where
|
countOccurances :: (Ord a) => [a] -> [(a, Int)]
|
||||||
combiningInsert m key = M.insertWith (+) key 1 m
|
countOccurances = M.toList . M.fromListWith (+) . map (, 1)
|
||||||
|
-- countOccurances = M.toList . F.foldl' combiningInsert M.empty
|
||||||
|
-- where
|
||||||
|
-- combiningInsert m key = M.insertWith (+) key 1 m
|
||||||
|
|
||||||
divideByteString :: Int -> BS.ByteString -> [BS.ByteString]
|
divideByteString :: Int -> BS.ByteString -> [BS.ByteString]
|
||||||
divideByteString n [] = []
|
divideByteString n [] = []
|
||||||
|
|
@ -113,15 +138,17 @@ divideByteString n bs = x : divideByteString n xs
|
||||||
|
|
||||||
|
|
||||||
compressionRatioFor
|
compressionRatioFor
|
||||||
:: forall a
|
:: forall a b
|
||||||
. (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialize a)
|
. (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a,
|
||||||
|
Integral b, Ord b, C.Serialise b, Bounded b)
|
||||||
=> D.Proxy a
|
=> D.Proxy a
|
||||||
|
-> D.Proxy b
|
||||||
-> BS.ByteString
|
-> BS.ByteString
|
||||||
-> Double
|
-> Double
|
||||||
compressionRatioFor proxy bs =
|
compressionRatioFor _ _ bs =
|
||||||
(/ (fromIntegral . BS.length $ bs))
|
(/ (fromIntegral . BS.length $ bs))
|
||||||
. fromIntegral
|
. fromIntegral
|
||||||
. BS.length
|
. BS.length
|
||||||
. C.encode
|
. C.serialise
|
||||||
. (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a))
|
. (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b))
|
||||||
$ bs
|
$ bs
|
||||||
|
|
|
||||||
97
src/Compress/LengthDistancePairs.hs
Normal file
97
src/Compress/LengthDistancePairs.hs
Normal file
|
|
@ -0,0 +1,97 @@
|
||||||
|
module Compress.LengthDistancePairs where
|
||||||
|
import qualified Data.ByteString as BS
|
||||||
|
import Data.Word
|
||||||
|
import qualified Data.Foldable as F
|
||||||
|
import qualified Relude.Unsafe as US
|
||||||
|
import qualified Debug.Trace as D
|
||||||
|
import Compress.Arithmetic (maxBound')
|
||||||
|
import qualified Data.Proxy as P
|
||||||
|
import qualified Codec.Winery as C
|
||||||
|
import qualified Relude.Unsafe as U
|
||||||
|
|
||||||
|
data LengthDistancePair a = Literal Word8 | Repeat {
|
||||||
|
distance :: a,
|
||||||
|
matchLength :: Word8
|
||||||
|
}
|
||||||
|
deriving (Eq, Ord, Show, Generic, NFData)
|
||||||
|
deriving C.Serialise via (C.WineryVariant (LengthDistancePair a))
|
||||||
|
|
||||||
|
|
||||||
|
ldpLength :: LengthDistancePair a -> Int
|
||||||
|
ldpLength (Literal _) = 1
|
||||||
|
ldpLength (Repeat {..}) = fromIntegral matchLength
|
||||||
|
|
||||||
|
|
||||||
|
-- | normally, we'd order `previous` in reverse, but finding
|
||||||
|
-- | the longest-matching-subsequence starting at `next`
|
||||||
|
-- | doesn't let us.
|
||||||
|
-- | The linked list dosen't really lend itself to the problem
|
||||||
|
data Cursor = Cursor {
|
||||||
|
previous :: [Word8],
|
||||||
|
next :: [Word8]
|
||||||
|
} deriving (Eq, Ord, Show, Generic)
|
||||||
|
|
||||||
|
toCursor :: BS.ByteString -> Cursor
|
||||||
|
toCursor bs= Cursor {
|
||||||
|
previous = [],
|
||||||
|
next = BS.unpack bs
|
||||||
|
}
|
||||||
|
|
||||||
|
word8Max :: Int
|
||||||
|
word8Max = fromIntegral (maxBound :: Word8)
|
||||||
|
|
||||||
|
cutTo :: Int -> [a] -> [a]
|
||||||
|
cutTo newLength xs = drop (max 0 . flip (-) newLength . length $ xs) xs
|
||||||
|
|
||||||
|
decode :: forall a . Integral a => [LengthDistancePair a] -> [Word8]
|
||||||
|
decode = foldl' decodeIndividual []
|
||||||
|
where
|
||||||
|
decodeIndividual xs (Literal x) = xs ++ [x]
|
||||||
|
decodeIndividual xs (Repeat {..})
|
||||||
|
= (++) xs
|
||||||
|
. take (fromIntegral matchLength)
|
||||||
|
. drop (F.length xs - fromIntegral distance)
|
||||||
|
$ xs
|
||||||
|
|
||||||
|
encode :: forall a . (Integral a, Bounded a) => [Word8] -> [LengthDistancePair a]
|
||||||
|
encode xs = encode' xs xs 0
|
||||||
|
where
|
||||||
|
|
||||||
|
encode' :: [Word8] -> [Word8] -> Int -> [LengthDistancePair a]
|
||||||
|
encode' predictFrom [] n = []
|
||||||
|
encode' predictFrom next 0 = []
|
||||||
|
encode' predictFrom next distance = match : encode' (drop (maxLength - newDistance) predictFrom) (drop matchLength next) newDistance
|
||||||
|
where
|
||||||
|
(match, matchLength)
|
||||||
|
= F.maximumBy (comparing snd)
|
||||||
|
. map (\x -> (x, ldpLength x))
|
||||||
|
. ( Literal (U.head next) : )
|
||||||
|
. zipWith Repeat (reverse [1 .. (fromIntegral distance)])
|
||||||
|
. map lengthMatchWithRepeat . tails
|
||||||
|
. take distance
|
||||||
|
$ predictFrom
|
||||||
|
|
||||||
|
|
||||||
|
maxLength = fromIntegral (minBound :: a)
|
||||||
|
|
||||||
|
newDistance = max maxLength . (+ matchLength) $ distance
|
||||||
|
|
||||||
|
|
||||||
|
lengthMatchWithRepeat cyclexs' = zipWithCycle' cyclexs' next
|
||||||
|
where
|
||||||
|
zipWithCycle' [] ys = zipWithCycle' cyclexs' ys
|
||||||
|
zipWithCycle' (x:xs) (y:ys) | x == y = 1 + zipWithCycle' xs ys
|
||||||
|
zipWithCycle' _ _ = 0
|
||||||
|
|
||||||
|
-- lengthMatchWithRepeat candidate
|
||||||
|
-- = genericLength
|
||||||
|
-- . takeWhile (uncurry (==)) . zip (cycle candidate)
|
||||||
|
-- $ next
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
-- fasterEncode bs = encode' start start 0
|
||||||
|
-- where
|
||||||
|
-- encode' toPredict predictFrom maxDistance = (take diff previous)
|
||||||
|
|
||||||
|
-- start = BS.unpack bs take
|
||||||
|
|
@ -17,9 +17,9 @@ import Data.PQueue qualified as PQ
|
||||||
import Debug.Trace qualified as D
|
import Debug.Trace qualified as D
|
||||||
import Debug.Trace qualified as T
|
import Debug.Trace qualified as T
|
||||||
import GHC.Generics
|
import GHC.Generics
|
||||||
import qualified Data.Serialize as C
|
|
||||||
import Data.FiniteBit
|
import Data.FiniteBit
|
||||||
import qualified Data.FiniteBit as B
|
import qualified Data.FiniteBit as B
|
||||||
|
import qualified Codec.Winery as C
|
||||||
|
|
||||||
data Tree a = (Ord a) =>
|
data Tree a = (Ord a) =>
|
||||||
Tree
|
Tree
|
||||||
|
|
@ -28,7 +28,8 @@ data Tree a = (Ord a) =>
|
||||||
|
|
||||||
newtype HuffmanPrefixTree a b = HuffmanPrefixTree
|
newtype HuffmanPrefixTree a b = HuffmanPrefixTree
|
||||||
{ inner :: M.Map a (HuffmanTree b)
|
{ inner :: M.Map a (HuffmanTree b)
|
||||||
} deriving (Eq, Ord, Show, Generic, C.Serialize)
|
} deriving (Eq, Ord, Show, Generic)
|
||||||
|
deriving C.Serialise via C.WineryRecord (HuffmanPrefixTree a b)
|
||||||
|
|
||||||
finiteBitTupleUncons ::
|
finiteBitTupleUncons ::
|
||||||
forall a b.
|
forall a b.
|
||||||
|
|
@ -85,7 +86,7 @@ compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> in
|
||||||
tree = toHuffmanTree . nGramCounts $ bs
|
tree = toHuffmanTree . nGramCounts $ bs
|
||||||
|
|
||||||
treeDirMap :: M.Map a (M.Map b [TreeDir])
|
treeDirMap :: M.Map a (M.Map b [TreeDir])
|
||||||
treeDirMap = M.map HT.findTreeDirections . Compress.PrefixTree.inner $ tree
|
treeDirMap = M.map HT.findTreeDirections tree.inner
|
||||||
|
|
||||||
initial :: Maybe a
|
initial :: Maybe a
|
||||||
initial = fst <$> finiteBitUncons bs
|
initial = fst <$> finiteBitUncons bs
|
||||||
|
|
@ -96,37 +97,6 @@ compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> in
|
||||||
treeDirsFor :: (a, b) -> [TreeDir]
|
treeDirsFor :: (a, b) -> [TreeDir]
|
||||||
treeDirsFor (a, b) = (treeDirMap M.! a) M.! b
|
treeDirsFor (a, b) = (treeDirMap M.! a) M.! b
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
-- | all (M.null . children) . M.elems . children $ tree =
|
|
||||||
-- fmap End
|
|
||||||
-- . HT.fromList
|
|
||||||
-- . map (\x -> (prefixCounts x, x))
|
|
||||||
-- . M.keys
|
|
||||||
-- . children
|
|
||||||
-- $ tree
|
|
||||||
-- | otherwise =
|
|
||||||
-- Just
|
|
||||||
-- . Layer
|
|
||||||
-- . M.mapMaybeWithKey (\key val -> toHuffmanTree' (key : soFar) val)
|
|
||||||
-- . children
|
|
||||||
-- $ tree
|
|
||||||
-- where
|
|
||||||
-- prefixCounts :: a -> Int
|
|
||||||
-- prefixCounts x =
|
|
||||||
-- fromIntegral
|
|
||||||
-- . sum
|
|
||||||
-- . M.elems
|
|
||||||
-- . M.filterWithKey (\key val -> L.isPrefixOf (reverse . (x :) $ soFar) key)
|
|
||||||
-- $ nGrams
|
|
||||||
|
|
||||||
-- toHuffmanTree :: Tree a -> p1 -> HuffmanTree a
|
|
||||||
-- toHuffmanTree :: forall a . Tree a -> M.Map [a] Word -> HuffmanTree [a]
|
|
||||||
-- toHuffmanTree (Tree {..}) nGrams soFar | M.size children == 1 = Leaf . map (reverse . (: soFar)) . M.keys $ children
|
|
||||||
-- toHuffmanTree (Tree {..}) nGrams soFar = Leaf . map (reverse . (: soFar)) . M.keys $ children
|
|
||||||
-- where
|
|
||||||
-- sorted = L.sortBy (prefixCounts . fst) . M.toList $ children
|
|
||||||
|
|
||||||
nGramCounts ::
|
nGramCounts ::
|
||||||
forall a b.
|
forall a b.
|
||||||
(Integral a, B.FiniteBits a, B.Bits a, Integral b, B.FiniteBits b, B.Bits b) =>
|
(Integral a, B.FiniteBits a, B.Bits a, Integral b, B.FiniteBits b, B.Bits b) =>
|
||||||
|
|
@ -155,34 +125,5 @@ fromSingleList (x : xs) = Tree . M.singleton x . fromSingleList $ xs
|
||||||
fromList :: (Ord a) => [[a]] -> Tree a
|
fromList :: (Ord a) => [[a]] -> Tree a
|
||||||
fromList = F.foldl' merge Compress.PrefixTree.empty . map fromSingleList
|
fromList = F.foldl' merge Compress.PrefixTree.empty . map fromSingleList
|
||||||
|
|
||||||
-- insert :: Ord a => Tree a -> [a] -> Tree a
|
|
||||||
-- insert (Tree {..}) (x:xs) =
|
|
||||||
|
|
||||||
merge :: Tree a -> Tree a -> Tree a
|
merge :: Tree a -> Tree a -> Tree a
|
||||||
merge (Tree children0) (Tree children1) = Tree $ M.unionWith merge children0 children1
|
merge (Tree children0) (Tree children1) = Tree $ M.unionWith merge children0 children1
|
||||||
|
|
||||||
-- deriving instance Eq (Tree a)
|
|
||||||
|
|
||||||
-- deriving instance Ord (Tree a)
|
|
||||||
|
|
||||||
-- deriving instance (Show a) => Show (Tree a)
|
|
||||||
|
|
||||||
-- empty :: (Ord a) => Tree a
|
|
||||||
-- empty = Tree M.empty
|
|
||||||
|
|
||||||
-- fromList :: (Ord a, F.Foldable t) => t [a] -> Tree a
|
|
||||||
-- fromList = F.foldl' insert empty
|
|
||||||
|
|
||||||
-- insert :: Tree a -> [a] -> Tree a
|
|
||||||
-- insert (Tree {..}) [] = Tree M.empty
|
|
||||||
-- insert (Tree {..}) (x : xs) =
|
|
||||||
-- Tree
|
|
||||||
-- . flip (M.insert x) children
|
|
||||||
-- . flip insert xs
|
|
||||||
-- . My.fromMaybe empty
|
|
||||||
-- . M.lookup x
|
|
||||||
-- $ children
|
|
||||||
|
|
||||||
-- lookup :: (Ord a) => [a] -> Tree a -> Bool
|
|
||||||
-- lookup [] = const True
|
|
||||||
-- lookup (x : xs) = maybe False (Compress.PrefixTree.lookup xs) . M.lookup x . children
|
|
||||||
|
|
|
||||||
8
src/Compress/WordMarkovStats.hs
Normal file
8
src/Compress/WordMarkovStats.hs
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
module Compress.WordMarkovStats where
|
||||||
|
|
||||||
|
|
||||||
|
data WordMarkovStats = WordMarkovStats
|
||||||
|
{ location :: Ratio Integer,
|
||||||
|
size :: Ratio Integer
|
||||||
|
}
|
||||||
|
deriving (Show, Eq, Ord, Generic)
|
||||||
62
src/Data/ArbitraryPrecisionFloatingPoint.hs
Normal file
62
src/Data/ArbitraryPrecisionFloatingPoint.hs
Normal file
|
|
@ -0,0 +1,62 @@
|
||||||
|
module Data.ArbitraryPrecisionFloatingPoint where
|
||||||
|
|
||||||
|
import Compress.Arithmetic (binarySearch)
|
||||||
|
import Data.Function.Pointless
|
||||||
|
|
||||||
|
data FloatingPoint = FloatingPoint
|
||||||
|
{ exponent :: Integer,
|
||||||
|
significand :: Integer
|
||||||
|
}
|
||||||
|
deriving (Eq)
|
||||||
|
|
||||||
|
instance Num FloatingPoint where
|
||||||
|
(+) :: FloatingPoint -> FloatingPoint -> FloatingPoint
|
||||||
|
(+) = onEqualExponent (+)
|
||||||
|
|
||||||
|
(*) :: FloatingPoint -> FloatingPoint -> FloatingPoint
|
||||||
|
(*) n0 n1 = correctExponent $ FloatingPoint (n0.exponent + n1.exponent) (n0.significand * n1.significand)
|
||||||
|
|
||||||
|
abs :: FloatingPoint -> FloatingPoint
|
||||||
|
abs (FloatingPoint {..}) = FloatingPoint exponent (abs significand)
|
||||||
|
|
||||||
|
negate :: FloatingPoint -> FloatingPoint
|
||||||
|
negate (FloatingPoint {..}) = FloatingPoint exponent (negate significand)
|
||||||
|
|
||||||
|
fromInteger :: Integer -> FloatingPoint
|
||||||
|
fromInteger = correctExponent . FloatingPoint 0
|
||||||
|
|
||||||
|
signum :: FloatingPoint -> FloatingPoint
|
||||||
|
signum = FloatingPoint 0 . signum . (.significand)
|
||||||
|
|
||||||
|
onEqualExponent :: (Integer -> Integer -> Integer) -> FloatingPoint -> FloatingPoint -> FloatingPoint
|
||||||
|
onEqualExponent f n0 n1 = correctExponent . FloatingPoint n0'.exponent $ on f (.significand) n0' n1'
|
||||||
|
where
|
||||||
|
(n0', n1') = equalizeExponents n0 n1
|
||||||
|
|
||||||
|
-- | significand shouldn't have trailing 0s in binary
|
||||||
|
-- | exponent should be one larger instead
|
||||||
|
-- | this function corrects until siginicand isn't divisible by 2
|
||||||
|
correctExponent :: FloatingPoint -> FloatingPoint
|
||||||
|
correctExponent (FloatingPoint {..})
|
||||||
|
| significand /= 0 && remainder == 0 = correctExponent $ FloatingPoint (succ exponent) quotient
|
||||||
|
| otherwise = (FloatingPoint {..})
|
||||||
|
where
|
||||||
|
(quotient, remainder) = divMod significand 2
|
||||||
|
|
||||||
|
equalizeExponents :: FloatingPoint -> FloatingPoint -> (FloatingPoint, FloatingPoint)
|
||||||
|
equalizeExponents n0 n1 =
|
||||||
|
( withDiff n0 exponent0Diff,
|
||||||
|
withDiff n1 exponent1Diff
|
||||||
|
)
|
||||||
|
where
|
||||||
|
withDiff n exponentDiff = FloatingPoint (n.exponent + exponentDiff) (n.significand * (2 ^ exponentDiff))
|
||||||
|
|
||||||
|
signedExponentDiff = n0.exponent - n1.exponent
|
||||||
|
|
||||||
|
(exponent0Diff, exponent1Diff)
|
||||||
|
| signedExponentDiff < 0 = (0, abs signedExponentDiff)
|
||||||
|
| otherwise = (signedExponentDiff, 0)
|
||||||
|
|
||||||
|
instance Ord FloatingPoint where
|
||||||
|
compare :: FloatingPoint -> FloatingPoint -> Ordering
|
||||||
|
compare = uncurry (on compare (.significand)) .: equalizeExponents
|
||||||
52
src/Data/Dirs.hs
Normal file
52
src/Data/Dirs.hs
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
-- |
|
||||||
|
|
||||||
|
module Data.Dirs where
|
||||||
|
import qualified Data.ByteString as BS
|
||||||
|
import qualified Data.Vector.Unboxed as VU
|
||||||
|
import qualified Data.Vector as V
|
||||||
|
import qualified Data.Bit as VB
|
||||||
|
import qualified Data.Bit as BV
|
||||||
|
import Data.Ratio
|
||||||
|
import Control.Monad.Par qualified as P
|
||||||
|
import Codec.Winery qualified as C
|
||||||
|
import qualified Relude as R
|
||||||
|
|
||||||
|
data Dir = Higher | Lower deriving Show
|
||||||
|
|
||||||
|
data SerializedDirs = SerializedDirs {
|
||||||
|
bs :: BS.ByteString,
|
||||||
|
length :: Int
|
||||||
|
} deriving (Generic, Show, Ord, Eq, P.NFData, C.Serialise)
|
||||||
|
|
||||||
|
-- toBS :: [Dir] -> SerializedDirs
|
||||||
|
serialize dirs = SerializedDirs {..}
|
||||||
|
where
|
||||||
|
bs = VB.cloneToByteString . VU.fromList . map dirToBit $ dirs
|
||||||
|
length = R.length dirs
|
||||||
|
|
||||||
|
deserialize :: SerializedDirs -> Rational
|
||||||
|
deserialize (SerializedDirs {..}) = fromDirs . map bitToDir . VU.toList $ asVec
|
||||||
|
where
|
||||||
|
asVec = VU.take length . VB.cloneFromByteString $ bs
|
||||||
|
|
||||||
|
addNewBit x Higher = (x * 2) + 1
|
||||||
|
addNewBit x Lower = x * 2
|
||||||
|
|
||||||
|
fromDirs = sum . (0.5 :) . zipWith (*) fractions . map dirToNum
|
||||||
|
where
|
||||||
|
|
||||||
|
addNewBit x Higher = (x * 2) + 1
|
||||||
|
addNewBit x Lower = x * 2
|
||||||
|
|
||||||
|
|
||||||
|
dirToNum Higher = 1
|
||||||
|
dirToNum Lower = -1
|
||||||
|
|
||||||
|
bitToDir (BV.Bit True ) = Higher
|
||||||
|
bitToDir (BV.Bit False) = Lower
|
||||||
|
|
||||||
|
dirToBit Higher = BV.Bit True
|
||||||
|
dirToBit Lower = BV.Bit False
|
||||||
|
|
||||||
|
fractions :: [Rational]
|
||||||
|
fractions = iterate ( / 2) (1 % 4)
|
||||||
|
|
@ -9,6 +9,9 @@ import Data.Vector.Unboxed qualified as V
|
||||||
import Data.Bit qualified as B
|
import Data.Bit qualified as B
|
||||||
import Data.Bit qualified as BV
|
import Data.Bit qualified as BV
|
||||||
import Data.ByteString qualified as BS
|
import Data.ByteString qualified as BS
|
||||||
|
import qualified Codec.Winery as C
|
||||||
|
import Data.FiniteBit (toByteString)
|
||||||
|
import Codec.Winery (serialise)
|
||||||
|
|
||||||
data HuffmanTree a
|
data HuffmanTree a
|
||||||
= Leaf a
|
= Leaf a
|
||||||
|
|
@ -16,24 +19,29 @@ data HuffmanTree a
|
||||||
{ left :: HuffmanTree a,
|
{ left :: HuffmanTree a,
|
||||||
right :: HuffmanTree a
|
right :: HuffmanTree a
|
||||||
}
|
}
|
||||||
deriving (Eq, Ord, Show, Generic, C.Serialize, Functor)
|
deriving (Eq, Ord, Show, Generic, Functor)
|
||||||
|
deriving C.Serialise via (C.WineryVariant (HuffmanTree a))
|
||||||
|
|
||||||
|
|
||||||
-- here so we can define our own Serialize instance
|
-- here so we can define our own Serialize instance
|
||||||
newtype TreeDirs = TreeDirs {
|
newtype TreeDirs = TreeDirs {
|
||||||
inner :: [TreeDir]
|
inner :: [TreeDir]
|
||||||
} deriving (Eq, Ord, Show)
|
} deriving (Eq, Ord, Show)
|
||||||
|
-- deriving C.Serialize via C.WineryVariant TreeDirs
|
||||||
|
|
||||||
data TreeDir = L | R deriving (Eq, Ord, Show)
|
data TreeDir = L | R deriving (Eq, Ord, Show)
|
||||||
|
|
||||||
instance C.Serialize TreeDirs where
|
newtype SerializedTreeDirs = SerializedTreeDirs
|
||||||
put :: C.Putter TreeDirs
|
{
|
||||||
put = C.put . Bi.first BV.cloneToByteString . (\x -> (x, V.length x)) . V.fromList . map (BV.Bit . (== R)) . (inner :: TreeDirs -> [TreeDir])
|
inner :: BS.ByteString
|
||||||
|
} deriving (Generic, Eq, Ord)
|
||||||
|
deriving C.Serialise via (C.WineryRecord SerializedTreeDirs)
|
||||||
|
|
||||||
get :: C.Get TreeDirs
|
serializeTreeDirs (TreeDirs inner) = SerializedTreeDirs . fst . Bi.first BV.cloneToByteString . (\x -> (x, V.length x)) . V.fromList . map (BV.Bit . (== R)) $ inner
|
||||||
get = do
|
|
||||||
(bs, len) <- C.get
|
unserializeTreeDirs (SerializedTreeDirs bs) = TreeDirs . map (\x -> if BV.unBit x then R else L) . V.toList . V.take len . BV.cloneFromByteString $ bs
|
||||||
pure . TreeDirs . map (\x -> if BV.unBit x then R else L) . V.toList . V.take len . BV.cloneFromByteString $ bs
|
where
|
||||||
|
len = BS.length bs
|
||||||
|
|
||||||
lookup ::
|
lookup ::
|
||||||
forall a.
|
forall a.
|
||||||
|
|
|
||||||
|
|
@ -32,10 +32,10 @@ maxView :: PQueue a -> Maybe (Int, (a, PQueue a))
|
||||||
maxView = abstractView IM.lookupMax
|
maxView = abstractView IM.lookupMax
|
||||||
|
|
||||||
insert :: Int -> p -> PQueue p -> PQueue p
|
insert :: Int -> p -> PQueue p -> PQueue p
|
||||||
insert key a = PQueue . IM.insertWith NE.append key (NE.singleton a) . toMap
|
insert key a = PQueue . IM.insertWith NE.append key (NE.singleton a) . (.toMap)
|
||||||
|
|
||||||
elems :: PQueue a -> [a]
|
elems :: PQueue a -> [a]
|
||||||
elems = concatMap NE.toList . IM.elems . toMap
|
elems = concatMap NE.toList . IM.elems . (.toMap)
|
||||||
|
|
||||||
keys :: PQueue a -> [IM.Key]
|
keys :: PQueue a -> [IM.Key]
|
||||||
keys = IM.keys . toMap
|
keys = IM.keys . (.toMap)
|
||||||
|
|
|
||||||
|
|
@ -1,46 +1,62 @@
|
||||||
|
-- | note to myself. Something's wrong here.
|
||||||
|
-- | it doesn't work properly when fed into Compress.Arithmetic
|
||||||
|
|
||||||
module Data.Word4 where
|
module Data.Word4 where
|
||||||
|
|
||||||
import Data.Bits
|
import Data.Bits
|
||||||
import Data.Word
|
import Data.Word
|
||||||
import qualified Data.Serialize as C
|
import qualified Codec.Winery as C
|
||||||
import qualified Data.Map as M
|
import qualified Data.Map as M
|
||||||
|
import qualified Text.Show as S
|
||||||
|
import qualified Relude.Unsafe as U
|
||||||
|
import Codec.Winery (WineryRecord)
|
||||||
|
|
||||||
-- only use lower 4 bits
|
-- only use lower 4 bits
|
||||||
newtype Word4 = Word4 {
|
newtype Word4 = Word4 {
|
||||||
inner :: Word8
|
inner :: Word8
|
||||||
} deriving (Eq, Ord, Show, Enum, Generic, C.Serialize)
|
} deriving (Eq, Ord, Enum, Generic)
|
||||||
|
deriving C.Serialise via C.WineryRecord Word4
|
||||||
|
|
||||||
|
instance S.Show Word4 where
|
||||||
|
show = show . (.inner)
|
||||||
|
|
||||||
|
|
||||||
map2 :: (Word8 -> Word8 -> Word8) -> Word4 -> Word4 -> Word4
|
map2 :: (Word8 -> Word8 -> Word8) -> Word4 -> Word4 -> Word4
|
||||||
map2 f (Word4 a) (Word4 b) = Word4 $ f a b
|
map2 f (Word4 a) (Word4 b) = Word4 $ f a b
|
||||||
|
|
||||||
data CompressWord4List = CompressWord4List
|
newtype CompressWord4List = CompressWord4List
|
||||||
{ xs :: [Word8],
|
{ xs :: [Word8]
|
||||||
last :: Maybe Word8 -- in lower 4 bits
|
} deriving (Eq, Ord, Show, Generic)
|
||||||
} deriving (Eq, Ord, Generic, Show, C.Serialize, NFData)
|
deriving C.Serialise via C.WineryVariant CompressWord4List
|
||||||
|
|
||||||
bitwiseAnd a b = getAnd $ And a <> And b
|
bitwiseAnd a b = getAnd $ And a <> And b
|
||||||
|
|
||||||
bitwiseOr a b = getIor $ Ior a <> Ior b
|
bitwiseOr a b = getIor $ Ior a <> Ior b
|
||||||
|
|
||||||
toCompressedWord4List [] = CompressWord4List [] Nothing
|
|
||||||
toCompressedWord4List [Word4 x] = CompressWord4List [] (Just x)
|
toCompressedWord4List = toCompressedWord4List' . U.init
|
||||||
toCompressedWord4List ((Word4 x) : (Word4 x') : xs) =
|
where
|
||||||
|
toCompressedWord4List' [] = CompressWord4List []
|
||||||
|
toCompressedWord4List' [Word4 x] = CompressWord4List [x .<<. 4]
|
||||||
|
toCompressedWord4List' ((Word4 x) : (Word4 x') : xs) =
|
||||||
CompressWord4List
|
CompressWord4List
|
||||||
{ xs = headByte : xs',
|
{ xs = headByte : xs'
|
||||||
last = last'
|
|
||||||
}
|
}
|
||||||
where
|
where
|
||||||
headByte = bitwiseOr (bitwiseAnd 0xf0 (x .<<. 4)) (bitwiseAnd 0x0f x')
|
headByte = bitwiseOr (bitwiseAnd 0xf0 (x .<<. 4)) (bitwiseAnd 0x0f x')
|
||||||
|
|
||||||
(CompressWord4List xs' last') = toCompressedWord4List xs
|
(CompressWord4List xs') = toCompressedWord4List' xs
|
||||||
|
|
||||||
|
|
||||||
fromCompressedWord4List :: CompressWord4List -> [Word4]
|
fromCompressedWord4List :: CompressWord4List -> [Word4]
|
||||||
fromCompressedWord4List (CompressWord4List [] Nothing) = []
|
fromCompressedWord4List = fromCompressedWord4List' . (.xs)
|
||||||
fromCompressedWord4List (CompressWord4List [] (Just a)) = [word4 a]
|
where
|
||||||
fromCompressedWord4List (CompressWord4List {xs = (x : xs), ..}) =
|
fromCompressedWord4List' :: [Word8] -> [Word4]
|
||||||
Word4 ((bitwiseAnd 0xf0 x) .>>. 4)
|
fromCompressedWord4List' [] = []
|
||||||
|
fromCompressedWord4List' (x : xs) =
|
||||||
|
word4 (x .>>. 4)
|
||||||
: word4 x
|
: word4 x
|
||||||
: fromCompressedWord4List (CompressWord4List xs last)
|
: fromCompressedWord4List' xs
|
||||||
|
|
||||||
-- instance Show Word4 where
|
-- instance Show Word4 where
|
||||||
-- show :: Word4 -> String
|
-- show :: Word4 -> String
|
||||||
|
|
@ -48,10 +64,10 @@ fromCompressedWord4List (CompressWord4List {xs = (x : xs), ..}) =
|
||||||
|
|
||||||
instance Bounded Word4 where
|
instance Bounded Word4 where
|
||||||
minBound = word4 0
|
minBound = word4 0
|
||||||
maxBound = word4 0xf
|
maxBound = word4 15
|
||||||
|
|
||||||
instance Real Word4 where
|
instance Real Word4 where
|
||||||
toRational = toRational . inner
|
toRational = toRational . (.inner)
|
||||||
|
|
||||||
instance Integral Word4 where
|
instance Integral Word4 where
|
||||||
quot = map2 quot `on` clean
|
quot = map2 quot `on` clean
|
||||||
|
|
@ -66,9 +82,9 @@ instance Integral Word4 where
|
||||||
divMod (Word4 a) (Word4 b) = (word4 a', word4 b')
|
divMod (Word4 a) (Word4 b) = (word4 a', word4 b')
|
||||||
where
|
where
|
||||||
(a', b') = divMod a b
|
(a', b') = divMod a b
|
||||||
toInteger = toInteger . inner . clean
|
toInteger = toInteger . (.inner) . clean
|
||||||
|
|
||||||
clean (Word4 a) = Word4 . bitwiseAnd 0xf $ a
|
clean (Word4 a) = Word4 . bitwiseAnd 0x0f $ a
|
||||||
word4 = clean . Word4
|
word4 = clean . Word4
|
||||||
|
|
||||||
instance Num Word4 where
|
instance Num Word4 where
|
||||||
|
|
|
||||||
71
src/Main.hs
71
src/Main.hs
|
|
@ -1,71 +0,0 @@
|
||||||
module Main where
|
|
||||||
|
|
||||||
import Compress.Huffman
|
|
||||||
import Compress.Huffman qualified as HT
|
|
||||||
import Compress.PrefixTree (HuffmanPrefixTree (HuffmanPrefixTree))
|
|
||||||
import Compress.PrefixTree qualified as PT
|
|
||||||
import Compress.Arithmetic qualified as A
|
|
||||||
import Data.Bifunctor qualified as Bi
|
|
||||||
import Data.Bit qualified as B
|
|
||||||
import Data.Bits (Bits ((.|.)))
|
|
||||||
import Data.Bits qualified as B
|
|
||||||
import Data.ByteString qualified as BS
|
|
||||||
import Data.HuffmanTree
|
|
||||||
import Data.Proxy qualified as P
|
|
||||||
import Data.Serialize qualified as C
|
|
||||||
import Data.Word
|
|
||||||
import GHC.Generics (Generic)
|
|
||||||
import Options.Generic qualified as O
|
|
||||||
import qualified Data.FiniteBit as FB
|
|
||||||
|
|
||||||
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
|
|
||||||
|
|
||||||
data CompressionStrategy = Huffman | MarkovHuffman deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
|
|
||||||
|
|
||||||
data CLIOpts = CLIOpts
|
|
||||||
{ task :: CompressOrDecompress,
|
|
||||||
strategy :: CompressionStrategy
|
|
||||||
}
|
|
||||||
deriving (Show, Generic, O.ParseRecord)
|
|
||||||
|
|
||||||
applyCompressionOptions ::
|
|
||||||
forall a.
|
|
||||||
(Integral a, B.Bits a, B.FiniteBits a, Ord a, C.Serialize a) =>
|
|
||||||
P.Proxy a ->
|
|
||||||
CLIOpts ->
|
|
||||||
BS.ByteString ->
|
|
||||||
BS.ByteString
|
|
||||||
applyCompressionOptions _ (CLIOpts Compress Huffman) f =
|
|
||||||
C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) $ f
|
|
||||||
applyCompressionOptions _ (CLIOpts Compress MarkovHuffman) f =
|
|
||||||
C.encode . (PT.compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a a, a)) $ f
|
|
||||||
applyCompressionOptions _ (CLIOpts Decompress Huffman) f =
|
|
||||||
handleError $ Bi.second decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanTree a)) $ f
|
|
||||||
applyCompressionOptions _ (CLIOpts Decompress MarkovHuffman) f =
|
|
||||||
handleError $ Bi.second PT.decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanPrefixTree a a, a)) $ f
|
|
||||||
|
|
||||||
handleError (Right (Just bs)) = bs
|
|
||||||
handleError _ = []
|
|
||||||
|
|
||||||
main :: IO ()
|
|
||||||
main = do
|
|
||||||
-- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
|
|
||||||
f <- BS.getContents
|
|
||||||
let fAsWords = (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
|
|
||||||
let compressedUnencoded :: (A.Compressed Word8 (A.Word4MapSerialized Word8)) = (A.compress fAsWords)
|
|
||||||
let compressed = C.encode compressedUnencoded
|
|
||||||
print ("compression ratio", (fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
|
|
||||||
print ("works?", (A.decompress compressedUnencoded) == fAsWords)
|
|
||||||
|
|
||||||
-- let f = "hello tehre"
|
|
||||||
-- f <- BS.readFile "pg64317.txt"
|
|
||||||
-- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f
|
|
||||||
-- print $ BS.length . C.encode $ compressed
|
|
||||||
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word8)) $ f
|
|
||||||
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word16)) $ f
|
|
||||||
-- BS.writeFile "outin.txt" decompressed
|
|
||||||
-- print (decompressed, f)
|
|
||||||
-- print $ BS.length decompressed
|
|
||||||
-- print $ BS.length f
|
|
||||||
-- print (decompressed == f)
|
|
||||||
-- BS.putStr . applyCompressionOptions (P.Proxy :: P.Proxy Word16) cliOpts $ f
|
|
||||||
92
test/Test.hs
Normal file
92
test/Test.hs
Normal file
|
|
@ -0,0 +1,92 @@
|
||||||
|
import Compress.Arithmetic (twoByteMarkov)
|
||||||
|
import Compress.Arithmetic qualified as A
|
||||||
|
import Compress.LengthDistancePairs as LDP
|
||||||
|
import Compress.WordMarkovStats as WM
|
||||||
|
import Data.Bifunctor qualified as Bi
|
||||||
|
import Data.ByteString qualified as BS
|
||||||
|
import Data.Dirs as D
|
||||||
|
import Data.Map.Strict qualified as M
|
||||||
|
import Data.Ratio
|
||||||
|
import Test.Falsify.Generator qualified as Gen
|
||||||
|
import Test.Falsify.Predicate as P
|
||||||
|
import Test.Falsify.Property qualified as P
|
||||||
|
import Test.Falsify.Range qualified as Range
|
||||||
|
import Test.Tasty
|
||||||
|
import Test.Tasty.Falsify as F
|
||||||
|
import Test.Tasty.HUnit
|
||||||
|
|
||||||
|
-- import qualified GHC.Enum as B
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
toCompress <- -- BS.take 12000 <$>
|
||||||
|
BS.readFile "pg64317.txt"
|
||||||
|
defaultMain (tests toCompress)
|
||||||
|
|
||||||
|
tests toCompress =
|
||||||
|
testGroup
|
||||||
|
"falsify"
|
||||||
|
[ testGroup
|
||||||
|
"Arithmetic"
|
||||||
|
[ testCase "works" $
|
||||||
|
A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8])
|
||||||
|
@?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])],
|
||||||
|
testCase "relativeCounts works as expected with one param" $
|
||||||
|
A.relativeCounts [(0, 30)]
|
||||||
|
@?= [(0, 1)],
|
||||||
|
testCase "relativeCounts works as expected with lots of params" $
|
||||||
|
A.relativeCounts [(0, 30), (1, 20)]
|
||||||
|
@?= [(0, 30 % 50), (1, 20 % 50)],
|
||||||
|
testCase "toRing locs all less than 1" $
|
||||||
|
assertBool "larger than one" $
|
||||||
|
all (all ((<= 1) . (.location))) toCompressRing,
|
||||||
|
testCase "toRing sizes all add up to 1" $
|
||||||
|
assertBool "larger than 1" $
|
||||||
|
all ((== 1) . sum . map (.size)) toCompressRing,
|
||||||
|
testCase "toRing gives no zero sizes" $
|
||||||
|
assertBool "== 0" $
|
||||||
|
all (all ((/= 0) . (.size))) toCompressRing,
|
||||||
|
F.testProperty "binary search" propBinarySearchWithinBounds,
|
||||||
|
F.testProperty "compress and decompress isomorphism" (propCompressDecompressIsomorphism (A.decompress . A.compress))
|
||||||
|
]
|
||||||
|
-- testGroup
|
||||||
|
-- "LengthDistancePair"
|
||||||
|
-- [ F.testProperty "compress and decompress isomorphism" $ (propCompressDecompressIsomorphism (LDP.decode . (LDP.encode :: [Word8] -> [LDP.LengthDistancePair Word32])))
|
||||||
|
-- ]
|
||||||
|
]
|
||||||
|
where
|
||||||
|
toCompressRing =
|
||||||
|
map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8)))
|
||||||
|
. M.elems
|
||||||
|
. A.twoByteMarkov
|
||||||
|
. BS.unpack
|
||||||
|
$ toCompress
|
||||||
|
|
||||||
|
wordMaxBound :: Integer
|
||||||
|
wordMaxBound = fromIntegral (maxBound :: Word)
|
||||||
|
|
||||||
|
genProperFraction = F.gen . fmap ((% wordMaxBound) . fromIntegral) . Gen.inRange $ (Range.between (0, maxBound :: Word))
|
||||||
|
|
||||||
|
-- propBinarySearchWithinBounds :: Property' String ()
|
||||||
|
propBinarySearchWithinBounds = do
|
||||||
|
bound1 <- genProperFraction
|
||||||
|
epsilon' <- genProperFraction
|
||||||
|
let epsilon = max (1 % (fromIntegral (maxBound :: Word))) . (* 0.1) $ bound1 * epsilon'
|
||||||
|
let bound2 = if bound1 + epsilon < 1 then bound1 + epsilon else bound1 - epsilon
|
||||||
|
-- let bound2 = min 1 $ bound1 + epsilon
|
||||||
|
let iso = D.deserialize . D.serialize $ A.binarySearch bound1 bound2
|
||||||
|
P.assert $ P.le .$ ("minimum", min bound1 bound2) .$ ("iso", iso)
|
||||||
|
P.assert $ P.ge .$ ("maximum", max bound1 bound2) .$ ("iso", iso)
|
||||||
|
|
||||||
|
-- P.assert $ Range.between (min bound1 bound2) (max bound1 bound2) .$ (A.fromDirs $ A.binarySearch bound1 bound2)
|
||||||
|
|
||||||
|
-- propCompressDecompressIsomorphism :: F.Property ()
|
||||||
|
-- propCompressDecompressIsomorphism :: Property' String ()
|
||||||
|
propCompressDecompressIsomorphism :: ([Word8] -> [Word8]) -> Property' String ()
|
||||||
|
propCompressDecompressIsomorphism iso = do
|
||||||
|
(xs :: [Word8]) <-
|
||||||
|
F.gen $ Gen.list (Range.between (4, 5000)) intGen
|
||||||
|
P.assert $ eq .$ ("xs", xs) .$ ("changed", iso $ xs)
|
||||||
|
where
|
||||||
|
intGen :: F.Gen Word8
|
||||||
|
intGen = Gen.inRange $ Range.between (0, fromIntegral (maxBound :: Word8))
|
||||||
Loading…
Add table
Add a link
Reference in a new issue