benchmark, test suite, and run length encoding
This commit is contained in:
parent
2123636291
commit
da83f9a5d0
19 changed files with 884 additions and 364 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -6,3 +6,4 @@
|
|||
/cabal.project.local
|
||||
/compress.hp
|
||||
/*.prof
|
||||
/calgarycorpus/
|
||||
|
|
|
|||
15
bench/Bench.hs
Normal file
15
bench/Bench.hs
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
import Criterion.Main
|
||||
import Compress.LengthDistancePairs
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Compress.Arithmetic as A
|
||||
|
||||
main = do
|
||||
testData <- BS.unpack . BS.take 10000 <$> (BS.readFile "pg64317.txt")
|
||||
defaultMain [
|
||||
bgroup "encode" [
|
||||
bench "length distance pair" $ nf (encode :: [Word8] -> [LengthDistancePair Word32]) testData
|
||||
],
|
||||
bgroup "arithmetic coding" [
|
||||
bench "pg64317.txt" $ nf A.compress testData
|
||||
]
|
||||
]
|
||||
|
|
@ -6,23 +6,26 @@ build-type: Simple
|
|||
license: MIT
|
||||
license-file: LICENSE
|
||||
|
||||
executable compress
|
||||
main-is:
|
||||
Main.hs
|
||||
common deps
|
||||
default-extensions:
|
||||
DataKinds,
|
||||
DeriveAnyClass,
|
||||
DeriveGeneric,
|
||||
DerivingStrategies,
|
||||
DerivingVia,
|
||||
DuplicateRecordFields,
|
||||
ExtendedDefaultRules,
|
||||
FlexibleContexts,
|
||||
FlexibleInstances,
|
||||
ImpredicativeTypes,
|
||||
InstanceSigs,
|
||||
LambdaCase,
|
||||
MultiParamTypeClasses,
|
||||
NamedFieldPuns,
|
||||
NoFieldSelectors,
|
||||
OverloadedLabels,
|
||||
OverloadedLists,
|
||||
OverloadedRecordDot,
|
||||
OverloadedStrings,
|
||||
PartialTypeSignatures,
|
||||
RankNTypes,
|
||||
|
|
@ -32,15 +35,12 @@ executable compress
|
|||
StandaloneDeriving,
|
||||
StrictData,
|
||||
TemplateHaskell,
|
||||
LambdaCase,
|
||||
TupleSections,
|
||||
TypeApplications,
|
||||
TypeFamilies,
|
||||
TypeOperators,
|
||||
TypeSynonymInstances,
|
||||
UndecidableInstances,
|
||||
hs-source-dirs:
|
||||
src
|
||||
mixins:
|
||||
base hiding (Prelude),
|
||||
relude (Relude as Prelude),
|
||||
|
|
@ -48,17 +48,21 @@ executable compress
|
|||
build-depends:
|
||||
base,
|
||||
bitvec,
|
||||
bytestring,
|
||||
cereal,
|
||||
bytestring,
|
||||
containers,
|
||||
winery,
|
||||
parsec,
|
||||
-- accelerate,
|
||||
-- containers-accelerate,
|
||||
parsec,
|
||||
leancheck,
|
||||
monad-par,
|
||||
monad-par-extras,
|
||||
nonempty-containers,
|
||||
optparse-generic,
|
||||
relude,
|
||||
pointless-fun,
|
||||
text,
|
||||
uuid,
|
||||
vector,
|
||||
|
|
@ -66,15 +70,63 @@ executable compress
|
|||
default-language:
|
||||
GHC2021
|
||||
other-modules:
|
||||
Data.PQueue
|
||||
Data.FiniteBit
|
||||
Compress.Arithmetic
|
||||
Compress.BurrowsWheeler
|
||||
Compress.Huffman
|
||||
Compress.PrefixTree
|
||||
Compress.WordMarkovStats
|
||||
Compress.LengthDistancePairs
|
||||
Data.Dirs
|
||||
Data.FiniteBit
|
||||
Data.HuffmanTree
|
||||
Compress.Arithmetic
|
||||
Data.PQueue
|
||||
Data.Word4
|
||||
Data.ArbitraryPrecisionFloatingPoint
|
||||
-- Data.CircularList
|
||||
hs-source-dirs:
|
||||
src
|
||||
ghc-options:
|
||||
-threaded
|
||||
-fprof-auto
|
||||
-fprof-late
|
||||
"-with-rtsopts=-p -hc -B -N -qa"
|
||||
"-with-rtsopts=-N"
|
||||
|
||||
|
||||
benchmark bench
|
||||
import: deps
|
||||
type:
|
||||
exitcode-stdio-1.0
|
||||
main-is:
|
||||
Bench.hs
|
||||
build-depends:
|
||||
criterion
|
||||
|
||||
hs-source-dirs:
|
||||
bench
|
||||
ghc-options:
|
||||
"-with-rtsopts=-T -p -hc -B -qa"
|
||||
|
||||
test-suite test
|
||||
import: deps
|
||||
type:
|
||||
exitcode-stdio-1.0
|
||||
main-is:
|
||||
Test.hs
|
||||
build-depends:
|
||||
tasty,
|
||||
tasty-hunit,
|
||||
falsify
|
||||
|
||||
hs-source-dirs:
|
||||
test
|
||||
|
||||
|
||||
executable compress
|
||||
import: deps
|
||||
|
||||
main-is:
|
||||
Main.hs
|
||||
|
||||
hs-source-dirs:
|
||||
src-exe
|
||||
|
||||
|
||||
-- -fllvm
|
||||
|
|
|
|||
131
flake.lock
generated
131
flake.lock
generated
|
|
@ -1,18 +1,34 @@
|
|||
{
|
||||
"nodes": {
|
||||
"accelerate": {
|
||||
"barbies": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1732969010,
|
||||
"narHash": "sha256-Qrmtrgij2GbklBXUK42Pt6Db8WiGijA5sz5oC5AR72c=",
|
||||
"owner": "AccelerateHS",
|
||||
"repo": "accelerate",
|
||||
"rev": "02da6161ef143a9886c8bce542cd96029c4f527a",
|
||||
"lastModified": 1712605099,
|
||||
"narHash": "sha256-jDyIDPiGWAw4qLRoYA4p6njANOg4/EOCx0jmFl607IM=",
|
||||
"owner": "jcpetruzza",
|
||||
"repo": "barbies",
|
||||
"rev": "856bc3d3cc72a13e95ed495afd15683c45c7cc55",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "AccelerateHS",
|
||||
"repo": "accelerate",
|
||||
"owner": "jcpetruzza",
|
||||
"repo": "barbies",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"barbies-th": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1730091166,
|
||||
"narHash": "sha256-kc3ObxnhAJW6vgroFIax/qBOrp3HIWoLRtu0jsJUGIE=",
|
||||
"owner": "fumieval",
|
||||
"repo": "barbies-th",
|
||||
"rev": "46c7b8c68634b219ff12e7966983f9b46a5976d4",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "fumieval",
|
||||
"repo": "barbies-th",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
|
|
@ -21,11 +37,11 @@
|
|||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1733312601,
|
||||
"narHash": "sha256-4pDvzqnegAfRkPwO3wmwBhVi/Sye1mzps0zHWYnP88c=",
|
||||
"lastModified": 1756770412,
|
||||
"narHash": "sha256-+uWLQZccFHwqpGqr2Yt5VsW/PbeJVTn9Dk6SHWhNRPw=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "205b12d8b7cd4802fbcb8e8ef6a0f1408781a4f9",
|
||||
"rev": "4524271976b625a4a605beefd893f270620fd751",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
|
@ -51,11 +67,11 @@
|
|||
},
|
||||
"haskell-flake": {
|
||||
"locked": {
|
||||
"lastModified": 1734984991,
|
||||
"narHash": "sha256-oUYtRBD3Yhw2jvKYo0lfd82fgEQQbFoiJcHO923gmOc=",
|
||||
"lastModified": 1756607542,
|
||||
"narHash": "sha256-+99fEAk0HwjYgIW2tEOs7ayBDxnU9NAM5E29ZxgyX40=",
|
||||
"owner": "srid",
|
||||
"repo": "haskell-flake",
|
||||
"rev": "daf00052906bdd977e57a07f7048437214232e87",
|
||||
"rev": "73e3891fb135c679a1c30fae4b101e5b41b8ca61",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
|
@ -64,42 +80,6 @@
|
|||
"type": "github"
|
||||
}
|
||||
},
|
||||
"llvm-hs": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1665495332,
|
||||
"narHash": "sha256-JKrpUmHJ1nsNiCoHhV5FCcdQGlNFfD37Oiu5kSmghfM=",
|
||||
"owner": "llvm-hs",
|
||||
"repo": "llvm-hs",
|
||||
"rev": "423220bffac4990d019fc088c46c5f25310d5a33",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "llvm-hs",
|
||||
"ref": "llvm-12",
|
||||
"repo": "llvm-hs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"llvm-hs-pure": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"dir": "llvm-hs-pure",
|
||||
"lastModified": 1665495332,
|
||||
"narHash": "sha256-JKrpUmHJ1nsNiCoHhV5FCcdQGlNFfD37Oiu5kSmghfM=",
|
||||
"owner": "llvm-hs",
|
||||
"repo": "llvm-hs",
|
||||
"rev": "423220bffac4990d019fc088c46c5f25310d5a33",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"dir": "llvm-hs-pure",
|
||||
"owner": "llvm-hs",
|
||||
"ref": "llvm-12",
|
||||
"repo": "llvm-hs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"mission-control": {
|
||||
"locked": {
|
||||
"lastModified": 1733438716,
|
||||
|
|
@ -117,11 +97,11 @@
|
|||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1734649271,
|
||||
"narHash": "sha256-4EVBRhOjMDuGtMaofAIqzJbg4Ql7Ai0PSeuVZTHjyKQ=",
|
||||
"lastModified": 1756542300,
|
||||
"narHash": "sha256-tlOn88coG5fzdyqz6R93SQL5Gpq+m/DsWpekNFhqPQk=",
|
||||
"owner": "nixos",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "d70bd19e0a38ad4790d3913bf08fcbfc9eeca507",
|
||||
"rev": "d7600c775f877cd87b4f5a831c28aa94137377aa",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
|
@ -133,28 +113,31 @@
|
|||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1733096140,
|
||||
"narHash": "sha256-1qRH7uAUsyQI7R1Uwl4T+XvdNv778H0Nb5njNrqvylY=",
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz"
|
||||
"lastModified": 1754788789,
|
||||
"narHash": "sha256-x2rJ+Ovzq0sCMpgfgGaaqgBSwY+LST+WbZ6TytnT9Rk=",
|
||||
"owner": "nix-community",
|
||||
"repo": "nixpkgs.lib",
|
||||
"rev": "a73b9c743612e4244d865a2fdee11865283c04e6",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/5487e69da40cbd611ab2cadee0b4637225f7cfae.tar.gz"
|
||||
"owner": "nix-community",
|
||||
"repo": "nixpkgs.lib",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"accelerate": "accelerate",
|
||||
"barbies": "barbies",
|
||||
"barbies-th": "barbies-th",
|
||||
"flake-parts": "flake-parts",
|
||||
"flake-root": "flake-root",
|
||||
"haskell-flake": "haskell-flake",
|
||||
"llvm-hs": "llvm-hs",
|
||||
"llvm-hs-pure": "llvm-hs-pure",
|
||||
"mission-control": "mission-control",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"systems": "systems",
|
||||
"treefmt-nix": "treefmt-nix"
|
||||
"treefmt-nix": "treefmt-nix",
|
||||
"winery": "winery"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
|
|
@ -179,11 +162,11 @@
|
|||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1734982074,
|
||||
"narHash": "sha256-N7M37KP7cHWoXicuE536GrVvU8nMDT/gpI1kja2hkdg=",
|
||||
"lastModified": 1756662192,
|
||||
"narHash": "sha256-F1oFfV51AE259I85av+MAia221XwMHCOtZCMcZLK2Jk=",
|
||||
"owner": "numtide",
|
||||
"repo": "treefmt-nix",
|
||||
"rev": "e41e948cf097cbf96ba4dff47a30ea6891af9f33",
|
||||
"rev": "1aabc6c05ccbcbf4a635fb7a90400e44282f61c4",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
|
@ -191,6 +174,22 @@
|
|||
"repo": "treefmt-nix",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"winery": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1732329065,
|
||||
"narHash": "sha256-CDFOD4B6cJgAe3ebTVF21/eCJtbHMEzmKF3b/XxMOw4=",
|
||||
"owner": "fumieval",
|
||||
"repo": "winery",
|
||||
"rev": "2d6976bc822f6d2c2d590703b9470cb9e62191a5",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "fumieval",
|
||||
"repo": "winery",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
|
|
|
|||
75
flake.nix
75
flake.nix
|
|
@ -8,19 +8,24 @@
|
|||
flake-root.url = "github:srid/flake-root";
|
||||
treefmt-nix.url = "github:numtide/treefmt-nix";
|
||||
treefmt-nix.inputs.nixpkgs.follows = "nixpkgs";
|
||||
accelerate = {
|
||||
url = "github:AccelerateHS/accelerate?submodules=1";
|
||||
flake = false;
|
||||
};
|
||||
llvm-hs = {
|
||||
url = "github:llvm-hs/llvm-hs?ref=llvm-12";
|
||||
flake = false;
|
||||
};
|
||||
llvm-hs-pure = {
|
||||
url = "github:llvm-hs/llvm-hs?dir=llvm-hs-pure&ref=llvm-12";
|
||||
flake = false;
|
||||
};
|
||||
mission-control.url = "github:Platonic-Systems/mission-control";
|
||||
|
||||
winery = {
|
||||
url = "github:fumieval/winery";
|
||||
flake = false;
|
||||
};
|
||||
|
||||
barbies-th = {
|
||||
url = "github:fumieval/barbies-th";
|
||||
flake = false;
|
||||
};
|
||||
|
||||
|
||||
barbies = {
|
||||
url = "github:jcpetruzza/barbies";
|
||||
flake = false;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
outputs = inputs:
|
||||
|
|
@ -38,7 +43,7 @@
|
|||
# See https://github.com/srid/haskell-flake/blob/master/example/flake.nix
|
||||
haskellProjects.default = {
|
||||
# The base package set (this value is the default)
|
||||
basePackages = pkgs.haskell.packages.ghc96;
|
||||
# basePackages = pkgs.haskell.packages.ghc910;
|
||||
|
||||
# Packages to add on top of `basePackages`
|
||||
packages = {
|
||||
|
|
@ -46,30 +51,32 @@
|
|||
# (Local packages are added automatically)
|
||||
# https://github.com/lehins/hip.git
|
||||
# hip.source = inputs.hip + "/hip";
|
||||
accelerate.source = inputs.accelerate;
|
||||
# llvm-hs.source = inputs.llvm-hs;
|
||||
llvm-hs-pure.source = inputs.llvm-hs-pure;
|
||||
winery.source = inputs.winery;
|
||||
barbies-th.source = inputs.barbies-th;
|
||||
barbies.source = inputs.barbies;
|
||||
};
|
||||
|
||||
# Add your package overrides here
|
||||
settings = {
|
||||
uuid.jailbreak = true;
|
||||
accelerate =
|
||||
{
|
||||
jailbreak = true;
|
||||
broken = false;
|
||||
};
|
||||
llvm-hs =
|
||||
{
|
||||
jailbreak = true;
|
||||
broken = false;
|
||||
};
|
||||
llvm-hs-pure =
|
||||
{
|
||||
jailbreak = true;
|
||||
broken = false;
|
||||
};
|
||||
|
||||
# accelerate =
|
||||
# {
|
||||
# jailbreak = true;
|
||||
# broken = false;
|
||||
# };
|
||||
# winery.broken = false;
|
||||
winery.jailbreak = true;
|
||||
winery.check = false;
|
||||
barbies-th.broken = false;
|
||||
barbies-th.jailbreak = true;
|
||||
barbies.broken = false;
|
||||
barbies.jailbreak = true;
|
||||
barbies.check = false;
|
||||
tree-diff.check = false;
|
||||
# hlint.jailbreak = true;
|
||||
# ghc-lib-parser.jailbreak = true;
|
||||
# ghc-lib-parser-ex.jailbreak = true;
|
||||
# gmp-with-cxx.jailbreak = true;
|
||||
};
|
||||
# hlint = {
|
||||
# jailbreak = true;
|
||||
|
|
@ -96,7 +103,7 @@
|
|||
programs.ormolu.enable = true;
|
||||
programs.nixpkgs-fmt.enable = true;
|
||||
programs.cabal-fmt.enable = true;
|
||||
programs.hlint.enable = false;
|
||||
# programs.hlint.enable = false;
|
||||
|
||||
# We use fourmolu
|
||||
programs.ormolu.package = pkgs.haskellPackages.fourmolu;
|
||||
|
|
@ -110,7 +117,6 @@
|
|||
hoogle = {
|
||||
description = "Start Hoogle server for project dependencies";
|
||||
exec = ''
|
||||
echo http://127.0.0.1:8888;
|
||||
hoogle serve -p 8888 --local;
|
||||
'';
|
||||
category = "Dev Tools";
|
||||
|
|
@ -139,6 +145,7 @@
|
|||
config.treefmt.build.devShell
|
||||
];
|
||||
nativeBuildInputs = with pkgs; [
|
||||
# llvmPackages_16.libllvm
|
||||
];
|
||||
};
|
||||
};
|
||||
|
|
|
|||
97
src-exe/Main.hs
Normal file
97
src-exe/Main.hs
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
module Main where
|
||||
|
||||
import Compress.Huffman
|
||||
import Compress.Huffman qualified as HT
|
||||
import Compress.PrefixTree qualified as PT
|
||||
import Compress.Arithmetic qualified as A
|
||||
import Data.Bifunctor qualified as Bi
|
||||
import Data.Bit qualified as B
|
||||
import Data.Bits (Bits ((.|.)))
|
||||
import Data.Bits qualified as B
|
||||
import Data.ByteString qualified as BS
|
||||
import Data.HuffmanTree
|
||||
import Data.Proxy qualified as P
|
||||
import Codec.Winery qualified as C
|
||||
import Data.Word
|
||||
import GHC.Generics (Generic)
|
||||
import Options.Generic qualified as O
|
||||
import qualified Data.FiniteBit as FB
|
||||
import qualified Relude.Unsafe as U
|
||||
import qualified Data.ByteString.Lazy as BSL
|
||||
-- import Data.Word4 (Word4(Word4))
|
||||
import Compress.BurrowsWheeler
|
||||
import qualified Control.Monad.Par as P
|
||||
import qualified Compress.LengthDistancePairs as LDP
|
||||
|
||||
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
|
||||
|
||||
data CompressionStrategy = Huffman | MarkovHuffman deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
|
||||
|
||||
data CLIOpts = CLIOpts
|
||||
{ task :: CompressOrDecompress,
|
||||
strategy :: CompressionStrategy
|
||||
}
|
||||
deriving (Show, Generic, O.ParseRecord)
|
||||
|
||||
-- applyCompressionOptions ::
|
||||
-- forall a.
|
||||
-- (Integral a, B.Bits a, B.FiniteBits a, Ord a, C.Serialise a) =>
|
||||
-- P.Proxy a ->
|
||||
-- CLIOpts ->
|
||||
-- BS.By(++ []) . teString ->
|
||||
-- BS.ByteString
|
||||
-- applyCompressionOptions _ (CLIOpts Compress Huffman) f =
|
||||
-- C.serialise . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) $ f
|
||||
-- applyCompressionOptions _ (CLIOpts Compress MarkovHuffman) f =
|
||||
-- C.encode . (PT.compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a a, a)) $ f
|
||||
-- applyCompressionOptions _ (CLIOpts Decompress Huffman) f =
|
||||
-- handleError $ Bi.second decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanTree a)) $ f
|
||||
-- applyCompressionOptions _ (CLIOpts Decompress MarkovHuffman) f =
|
||||
-- handleError $ Bi.second PT.decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanPrefixTree a a, a)) $ f
|
||||
|
||||
handleError (Right (Just bs)) = bs
|
||||
handleError _ = []
|
||||
|
||||
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
-- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
|
||||
f <- BS.getContents
|
||||
let f' = BS.unpack f
|
||||
print . BS.pack . A.decompress . A.compress $ f'
|
||||
-- print =<< (P.runParIO $ do
|
||||
-- let fCounts = (toCounts :: [Word8] -> [(Word8, Word64)]) . {- toBurrowsWheeler . -} (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
|
||||
-- let fAsWords = map fst fCounts
|
||||
-- -- let compressedUnencoded {- :: (A.Compressed Word8 (A.Word4MapSerialized Word8)) -} = (A.compress fAsWords)
|
||||
-- -- print . A.talkAboutCompressed $ compressedUnencoded
|
||||
-- -- let compressed = C.serialise (compressedUnencoded, A.compress . map snd $ fCounts)
|
||||
-- -- huffmanCompressionRatio' <- P.spawnP $ (compressionRatioFor (Proxy :: Proxy Word8) (P.Proxy :: Proxy Word8) f)
|
||||
-- -- arithmaticCodingCompressionRatio' <- P.spawnP $ ((fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
|
||||
-- -- huffmanCompressionRatio <- P.get huffmanCompressionRatio'
|
||||
-- -- arithmaticCodingCompressionRatio <- P.get arithmaticCodingCompressionRatio'
|
||||
-- -- let lengthDistancePairsCompressedSize = fromIntegral . BS.length . C.serialise . (LDP.encode :: ByteString -> [LDP.LengthDistancePair Word16]) $ f
|
||||
-- -- let lengthDistancePairsCompressionRatio :: Double = lengthDistancePairsCompressedSize / (fromIntegral $ BS.length f)
|
||||
-- pure ()) -- (lengthDistancePairsCompressionRatio))
|
||||
-- let decompressed = (A.decompress compressedUnencoded)
|
||||
-- print ("huffman coding", huffmanCompressionRatio)
|
||||
-- print ("compression ratio (arithmetic coding)", arithmaticCodingCompressionRatio)
|
||||
-- print ("works?", decompressed == fAsWords)
|
||||
|
||||
|
||||
-- print . take 10 . drop 70 . zip fAsWords $ decompressed
|
||||
|
||||
-- print . ("original length", ) . length $ fAsWords
|
||||
|
||||
-- let f = "hello tehre"
|
||||
-- f <- BS.readFile "pg64317.txt"
|
||||
-- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f
|
||||
-- print $ BS.length . C.encode $ compressed
|
||||
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word8)) $ f
|
||||
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word16)) $ f
|
||||
-- BS.writeFile "outin.txt" decompressed
|
||||
-- print (decompressed, f)
|
||||
-- print $ BS.length decompressed
|
||||
-- print $ BS.length f
|
||||
-- print (decompressed == f)
|
||||
-- BS.putStr . applyCompressionOptions (P.Proxy :: P.Proxy Word16) cliOpts $ f
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
-- | https://en.wikipedia.org/wiki/Arithmetic_coding
|
||||
module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized) where
|
||||
-- module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized, Word4Map) where
|
||||
module Compress.Arithmetic where
|
||||
|
||||
import Data.Bits qualified as B
|
||||
import Data.Word
|
||||
|
|
@ -16,12 +17,23 @@ import Data.Map.Strict qualified as M
|
|||
import Data.Maybe qualified as My
|
||||
import Data.Ord
|
||||
import Data.Ratio
|
||||
import Data.Serialize qualified as C
|
||||
import Codec.Winery qualified as C
|
||||
import GHC.Generics (Generic)
|
||||
import GHC.Natural (Natural)
|
||||
import qualified Relude.Unsafe as US
|
||||
import Data.Word4 (Word4, CompressWord4List, toCompressedWord4List, fromCompressedWord4List)
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Relude.Unsafe as U
|
||||
import Codec.Winery (WineryRecord)
|
||||
import Compress.BurrowsWheeler (toBurrowsWheeler)
|
||||
import qualified Data.Bit as BV
|
||||
import qualified Data.Vector as V
|
||||
import qualified Data.Bit as VB
|
||||
import qualified Data.Vector.Unboxed as VU
|
||||
import Data.Dirs
|
||||
-- import qualified Debug.Trace as D
|
||||
import qualified Data.Proxy as P
|
||||
import Compress.WordMarkovStats as WM
|
||||
|
||||
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
|
||||
|
||||
|
|
@ -29,41 +41,52 @@ import qualified Data.ByteString as BS
|
|||
-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums
|
||||
|
||||
|
||||
data WordMarkovStats = WordMarkovStats
|
||||
{ location :: Ratio Integer,
|
||||
size :: Ratio Integer
|
||||
|
||||
}
|
||||
deriving (Show, Eq, Ord, Generic, C.Serialize)
|
||||
|
||||
|
||||
loc' (WordMarkovStats {..}) = location
|
||||
|
||||
toRing :: forall a k. (Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
|
||||
toRing :: forall a k. (Show a, Show k, Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
|
||||
toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
|
||||
where
|
||||
sizes = map ((% maxBound') . fromIntegral . snd) xs
|
||||
|
||||
wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
|
||||
|
||||
asFracsOfTotalSum = map fst . US.tail . L.scanl' f (0, 0) $ sizes
|
||||
|
||||
maxBound' :: Integer
|
||||
maxBound' = fromIntegral (maxBound :: a)
|
||||
|
||||
resize xs = map ( / sum' ) xs
|
||||
where
|
||||
sum' = sum xs
|
||||
|
||||
withBumpedZeroSized :: [Rational]
|
||||
withBumpedZeroSized
|
||||
| numZeroSized /= 0 = map (max (remainingSpace / numZeroSized)) asFracsOfTotalSum
|
||||
| otherwise = asFracsOfTotalSum
|
||||
= resize
|
||||
. map (max minimumSize)
|
||||
. resize
|
||||
. map (fromIntegral . snd)
|
||||
$ xs
|
||||
where
|
||||
-- | we have to round the 0s up to something
|
||||
-- | we somewhat arbitrarily choose half of the smallest
|
||||
-- | normally representable amount
|
||||
minimumSize :: Rational
|
||||
minimumSize = 1 % (maxBound' * 2)
|
||||
|
||||
remainingSpace = 1 - sum asFracsOfTotalSum
|
||||
|
||||
numZeroSized = L.genericLength . filter (== 0) $ asFracsOfTotalSum
|
||||
|
||||
f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum)
|
||||
where
|
||||
newFrac = currFrac * (1 - runningSum)
|
||||
|
||||
twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k [(k, b)]
|
||||
relativeCounts :: forall k . Ord k => [(k, Integer)] -> [(k, Rational)]
|
||||
relativeCounts m = map (Bi.second asFractionOfSum) m
|
||||
where
|
||||
asFractionOfSum = ( % sum') . fromIntegral
|
||||
|
||||
sum' = fromIntegral . F.sum . map snd $ m
|
||||
|
||||
|
||||
|
||||
maxBound' :: forall a b . (Num b, Bounded a, Integral a) => P.Proxy a -> b
|
||||
maxBound' p = fromIntegral (maxBound :: a)
|
||||
|
||||
twoByteMarkov :: forall k . (Ord k) => [k] -> M.Map k [(k, Rational)]
|
||||
twoByteMarkov xs =
|
||||
M.map sizeAsFraction
|
||||
. M.fromListWith (M.unionWith (+))
|
||||
|
|
@ -72,77 +95,106 @@ twoByteMarkov xs =
|
|||
. US.tail
|
||||
$ xs
|
||||
where
|
||||
-- optimization, changes fractions so they represent the fraction
|
||||
-- of remaining space in the list taken up instead of total space
|
||||
sizeAsFraction :: M.Map k Integer -> [(k, b)]
|
||||
sizeAsFraction m =
|
||||
zip keys
|
||||
. map discretizeFraction
|
||||
. fractionOfRemainingSums
|
||||
$ counts
|
||||
where
|
||||
fractionOfRemainingSums xs = zipWith (%) xs . scanr (+) 0 $ xs
|
||||
|
||||
asList = L.sortOn (negate . snd) . M.assocs $ m
|
||||
sizeAsFraction :: M.Map k Integer -> [(k, Rational)]
|
||||
sizeAsFraction = relativeCounts . M.assocs
|
||||
|
||||
keys = map fst asList
|
||||
data Chunk a = Chunk
|
||||
{ location :: SerializedDirs,
|
||||
start :: a,
|
||||
length :: Word64
|
||||
}
|
||||
deriving (Eq, Ord, Show, Generic, P.NFData)
|
||||
deriving C.Serialise via C.WineryVariant (Chunk a)
|
||||
|
||||
counts = map snd asList
|
||||
-- maxBound = maxBound' (P.Proxy :: Proxy b)
|
||||
|
||||
discretizeFraction :: Rational -> b
|
||||
discretizeFraction = floor . fromRational . (* maxBound')
|
||||
discretizeFraction :: forall b . (Bounded b, Integral b) => Rational -> b
|
||||
discretizeFraction = floor . min maxBound'' . fromRational . (* (succ $ maxBound' (P.Proxy :: P.Proxy b)))
|
||||
where
|
||||
maxBound'' = maxBound' (P.Proxy :: P.Proxy b)
|
||||
|
||||
maxBound' :: Ratio Integer
|
||||
maxBound' = fromIntegral (maxBound :: b)
|
||||
|
||||
type Word4MapSerialized a = M.Map a ([a], CompressWord4List)
|
||||
type Word4Map a = M.Map a [(a, Word4)]
|
||||
type Word4Map a = M.Map a [(a, Word8)]
|
||||
|
||||
|
||||
-- talkAboutCompressed (Compressed {..}) = (
|
||||
-- ("markovs (serealized)", BS.length . C.serialiseOnly $ markovs),
|
||||
-- ("markovs (unserealized)", BS.length . C.serialiseOnly . unserealizeWord4Map $ markovs),
|
||||
-- ("chunks", BS.length . C.serialiseOnly $ chunks))
|
||||
|
||||
newtype Prediction = Prediction {
|
||||
chances :: [(Word8, Rational)]
|
||||
}
|
||||
|
||||
class CompressionModel a where
|
||||
createModel :: [Word8] -> a
|
||||
|
||||
-- done in reverse
|
||||
predict :: a -> [Word8] -> Prediction
|
||||
|
||||
-- at some point write
|
||||
-- instance IsoSerialisable a b => C.Serialise b
|
||||
class C.Serialise b => IsoSerialisable a b where
|
||||
toSerialisable :: a -> b
|
||||
|
||||
fromSerialisable :: b -> a
|
||||
|
||||
-- newtype BetterCompressed = BetterCompressed (Compressed Word8 (A.Word4MapSerialized Word8))
|
||||
-- deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
|
||||
-- deriving C.Serialise via C.WineryRecord BetterCompressed
|
||||
|
||||
data Compressed a mapImplementation = Compressed
|
||||
{ markovs :: mapImplementation,
|
||||
chunks :: [Chunk a]
|
||||
}
|
||||
deriving (Eq, Ord, Show, Generic) -- , C.Serialize, P.NFData)
|
||||
deriving (Eq, Ord, Show, Generic, NFData) -- , C.Serialise, P.NFData)
|
||||
deriving C.Serialise via C.WineryVariant (Compressed a mapImplementation)
|
||||
|
||||
|
||||
deriving instance (Ord a, C.Serialize a) => C.Serialize (Compressed a (Word4MapSerialized a))
|
||||
|
||||
data Chunk a = Chunk
|
||||
{ location :: Ratio Integer,
|
||||
start :: a,
|
||||
length :: Int
|
||||
}
|
||||
deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
|
||||
-- unserealizeWord4Map :: forall a . Show a => Word4MapSerialized a -> Word4Map a
|
||||
-- unserealizeWord4Map = M.map unserializedIndividualMap
|
||||
-- where
|
||||
-- unserializedIndividualMap :: ([a], CompressWord4List) -> [(a, Word4)]
|
||||
-- unserializedIndividualMap (bytes, sizes) = zip bytes . fromCompressedWord4List $ sizes
|
||||
|
||||
unserealizeWord4Map :: Word4MapSerialized a -> Word4Map a
|
||||
unserealizeWord4Map = M.map (uncurry zip . Bi.second ((++ [maxBound]) . fromCompressedWord4List))
|
||||
-- -- unserealizeWord4Map = M.map (uncurry zip . map (\(bytes, sizes) -> (bytes,) . (++ [maxBound]) . fromCompressedWord4List (Prelude.length bytes) $ sizes))
|
||||
|
||||
serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
|
||||
serealizeWord4Map = M.map (Bi.second (toCompressedWord4List . L.init) . L.unzip)
|
||||
|
||||
-- serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
|
||||
-- serealizeWord4Map = M.map (Bi.second toCompressedWord4List . L.unzip)
|
||||
|
||||
decompress ::
|
||||
forall a.
|
||||
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
||||
Compressed a (Word4MapSerialized a) ->
|
||||
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
||||
Compressed a (Word4Map a) ->
|
||||
[a]
|
||||
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ chunks
|
||||
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk
|
||||
$ chunks
|
||||
where
|
||||
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs'
|
||||
|
||||
markovs' = unserealizeWord4Map markovs
|
||||
markovs' = -- unserealizeWord4Map
|
||||
markovs
|
||||
|
||||
|
||||
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
|
||||
|
||||
decompressChunk :: Chunk a -> [a]
|
||||
decompressChunk (Chunk {..}) = map snd . take length . L.iterate' decompress' $ (location, start)
|
||||
decompressChunk (Chunk {..}) = map snd . take (fromIntegral length) . L.iterate' decompress' $ (location', start) -- (fromBS rawLocation, start)
|
||||
where
|
||||
|
||||
location' = deserialize location
|
||||
-- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
|
||||
decompress' (!loc, !prev) = ((loc - ansLoc) / newSize, newVal)
|
||||
decompress' (!loc, !prevWord) = -- traceShow (newSize) $
|
||||
((loc - ansLoc) / newSize, newVal)
|
||||
where
|
||||
ansLoc :: Ratio Integer
|
||||
newVal :: a
|
||||
newSize :: Ratio Integer
|
||||
(!ansLoc, (!newVal, !newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev)
|
||||
(ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prevWord)
|
||||
|
||||
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
|
||||
sanityCheck = map (sum . M.elems) . M.elems
|
||||
|
|
@ -156,18 +208,31 @@ chunk chunkSize = chunk'
|
|||
(xs', xs'') = splitAt chunkSize xs
|
||||
|
||||
chunkLength :: Int
|
||||
chunkLength = 8000
|
||||
chunkLength = 4096
|
||||
|
||||
|
||||
-- runPar = id
|
||||
-- parMap = map
|
||||
|
||||
-- compress ::
|
||||
-- forall a.
|
||||
-- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
||||
-- [a] ->
|
||||
-- Compressed a (Word4MapSerialized a)
|
||||
compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))) $ Compressed (serealizeWord4Map twoByteMarkovs) . P.runPar . P.parMap compressChunk $ unCompressedChunks
|
||||
compress :: (NFData a, Show a, Ord a) => [a] -> Compressed a (Word4Map a)
|
||||
compress toCompress' = Compressed twoByteMarkovs . P.runPar . P.parMap
|
||||
(compressChunk)
|
||||
$ unCompressedChunks
|
||||
where
|
||||
|
||||
toCompress = toCompress' -- toBurrowsWheeler toCompress'
|
||||
-- toTraceShow' = (unserealizeWord4Map . serealizeWord4Map $ twoByteMarkovs) == twoByteMarkovs
|
||||
|
||||
-- toTraceShow = (fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))
|
||||
|
||||
unCompressedChunks = chunk chunkLength toCompress
|
||||
|
||||
twoByteMarkovs = twoByteMarkov toCompress
|
||||
twoByteMarkovs = M.map (map (Bi.second discretizeFraction)) . twoByteMarkov $ toCompress
|
||||
|
||||
rings = M.map toRing twoByteMarkovs
|
||||
|
||||
|
|
@ -175,12 +240,23 @@ compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.
|
|||
Chunk
|
||||
{ location = shortestLocation endStats,
|
||||
start = US.head toCompress,
|
||||
length = L.length toCompress
|
||||
length = fromIntegral . L.length $ toCompress
|
||||
}
|
||||
where
|
||||
pairs = zip toCompress . US.tail $ toCompress
|
||||
|
||||
shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size)
|
||||
shortestLocation (WordMarkovStats {..}) = serialize $ binarySearch location (location + size)
|
||||
-- shortestLocation (WordMarkovStats {..})
|
||||
-- -- | simplestBetweenAnswer /= upperBound = simplestBetweenAnswer
|
||||
-- | otherwise = simplestBetween location $ upperBound - epsilon
|
||||
-- where
|
||||
-- simplestBetweenAnswer = simplestBetween location upperBound
|
||||
|
||||
-- upperBound = location + size
|
||||
|
||||
-- -- | almost entirely arbitrary, picked because it's gaurenteed to be smaller than size
|
||||
-- -- | and should make for a fairly quick subtraction
|
||||
-- epsilon = 1 % (denominator size)
|
||||
|
||||
endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs
|
||||
|
||||
|
|
@ -194,6 +270,7 @@ compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.
|
|||
size = prevSize * nextSize
|
||||
}
|
||||
|
||||
|
||||
pyramidFold :: (a -> a -> a) -> [a] -> a
|
||||
pyramidFold f = pyramid
|
||||
where
|
||||
|
|
@ -209,6 +286,7 @@ simplestBetween :: Rational -> Rational -> Rational
|
|||
simplestBetween x y
|
||||
| x == y = x
|
||||
| x > 0 = simplestBetween' n d n' d'
|
||||
-- | x > 0 = simplestBetween' n d n' d'
|
||||
| otherwise = 0 % 1
|
||||
where
|
||||
n = numerator x
|
||||
|
|
@ -226,3 +304,15 @@ simplestBetween x y
|
|||
nd'' = simplestBetween' d' r' d r
|
||||
n'' = numerator nd''
|
||||
d'' = denominator nd''
|
||||
|
||||
binarySearch x y = binarySearch' 0 1
|
||||
where
|
||||
acceptLow = min x y
|
||||
acceptHigh = max x y
|
||||
|
||||
binarySearch' searchSpaceLow searchSpaceHigh
|
||||
| mid < acceptLow = Higher : binarySearch' mid searchSpaceHigh
|
||||
| mid > acceptHigh = Lower : binarySearch' searchSpaceLow mid
|
||||
| otherwise = []
|
||||
where
|
||||
mid = (searchSpaceLow + searchSpaceHigh) / 2
|
||||
|
|
|
|||
27
src/Compress/BurrowsWheeler.hs
Normal file
27
src/Compress/BurrowsWheeler.hs
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
-- |
|
||||
|
||||
module Compress.BurrowsWheeler where
|
||||
-- import qualified Data.CircularList as CL
|
||||
import qualified Relude.Unsafe as U
|
||||
-- import Compress.BurrowsWheeler (toBurrowsWheeler)
|
||||
|
||||
|
||||
toBurrowsWheeler (x:xs) = map U.head . sortOn (U.tail) . take (pred . length $ xs) . tails $ xs'
|
||||
where
|
||||
xs' = toList $ (x :| xs) <> (x :| [])
|
||||
|
||||
toCounts :: forall a b . (Eq a, Eq b, Num b, Enum b, Bounded a, Bounded b) => [a] -> [(a, b)]
|
||||
toCounts = reverse . foldl' f []
|
||||
where
|
||||
f [] x = [(x, 0)]
|
||||
f ((x, count) : xs) newVal
|
||||
| x == newVal && count == maxBound = (x, 0) : (x, count) : xs
|
||||
| x == newVal = (x, succ count) : xs
|
||||
| otherwise = (newVal, 0) : (x, count) : xs
|
||||
|
||||
|
||||
-- toBurrowsWheeler xs = map last . sort . map (findLyndonWord) . take (length xs) . tails $ xs'
|
||||
-- where
|
||||
-- xs' = xs ++ xs
|
||||
|
||||
-- findLyndonWord (x : xs) = (x :|) . map snd . takeWhile (uncurry (<=)) . zip (x : xs) $ xs
|
||||
|
|
@ -20,7 +20,7 @@ import Data.Bits as B
|
|||
import Data.Maybe (fromMaybe)
|
||||
import Data.Maybe qualified as My
|
||||
import Data.PQueue qualified as PQ
|
||||
import Data.Serialize qualified as C
|
||||
import Codec.Winery qualified as C
|
||||
import Data.Text qualified as T
|
||||
import Data.Text.IO qualified as TIO
|
||||
import Data.Vector.Unboxed qualified as V
|
||||
|
|
@ -30,6 +30,8 @@ import GHC.Generics (Generic)
|
|||
import Options.Generic qualified as O
|
||||
import System.Environment qualified as SE
|
||||
import Data.HuffmanTree
|
||||
import Data.Word4 (CompressWord4List(xs))
|
||||
import Compress.BurrowsWheeler (toBurrowsWheeler, toCounts)
|
||||
|
||||
decompress
|
||||
:: forall a
|
||||
|
|
@ -50,13 +52,27 @@ decompress (TreeDirs treeDirs, tree) = BS.concat . map toByteString <$> decompre
|
|||
nextLeaf (L : xs) (Node {..}) = nextLeaf xs left
|
||||
nextLeaf (R : xs) (Node {..}) = nextLeaf xs right
|
||||
|
||||
data BurrowsWheelerCompressed a b = BurrowsWheelerCompressed {
|
||||
elements :: Compressed SerializedTreeDirs a,
|
||||
counts :: Compressed SerializedTreeDirs b
|
||||
}
|
||||
deriving (Eq, Ord, Generic) -- , C.Serialise, P.NFData)
|
||||
deriving C.Serialise via C.WineryRecord (BurrowsWheelerCompressed a b)
|
||||
|
||||
data Compressed treeDir a = Compressed {
|
||||
treeDirs :: treeDir,
|
||||
huffman :: HuffmanTree a
|
||||
}
|
||||
deriving (Eq, Ord, Show, Generic) -- , C.Serialise, P.NFData)
|
||||
deriving C.Serialise via C.WineryRecord (Compressed treeDir a)
|
||||
|
||||
compress
|
||||
:: forall a
|
||||
. (Ord a, Integral a, B.FiniteBits a, B.Bits a)
|
||||
=> BS.ByteString
|
||||
-> Maybe (TreeDirs, HuffmanTree a)
|
||||
compress bs =
|
||||
liftA2 (,) (TreeDirs <$> treeDirections) mergedHuffmanTrees
|
||||
. (Ord a, Integral a)
|
||||
=> [a]
|
||||
-> Maybe (Compressed SerializedTreeDirs a)
|
||||
compress dividedByteString =
|
||||
liftA2 Compressed (serializeTreeDirs . TreeDirs <$> treeDirections) mergedHuffmanTrees
|
||||
where
|
||||
treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString
|
||||
|
||||
|
|
@ -64,13 +80,19 @@ compress bs =
|
|||
mergeHuffmanTrees
|
||||
. PQ.fromList
|
||||
. map (uncurry (flip (,)) . Bi.first Leaf)
|
||||
. counts
|
||||
. countOccurances
|
||||
$ dividedByteString
|
||||
|
||||
treeDirMap :: M.Map a [TreeDir]
|
||||
treeDirMap = My.maybe M.empty findTreeDirections mergedHuffmanTrees
|
||||
|
||||
dividedByteString = toWordsList bs
|
||||
compressWithBurrowsWheeler :: (Integral b, Integral a, FiniteBits a, Bounded b, Bounded a) => ByteString -> Maybe (BurrowsWheelerCompressed a b)
|
||||
compressWithBurrowsWheeler bs
|
||||
= liftA2 BurrowsWheelerCompressed
|
||||
(compress . map fst $ withCounts)
|
||||
(compress . map snd $ withCounts)
|
||||
where
|
||||
withCounts = toCounts . toBurrowsWheeler . toWordsList $ bs
|
||||
|
||||
-- testCompression
|
||||
-- :: forall a
|
||||
|
|
@ -100,10 +122,13 @@ compress bs =
|
|||
-- decodeCompressed :: forall a. (Ord a, Integral a, B.FiniteBitsOps a, C.Serialize a) => BS.ByteString -> Either String (Maybe (TreeDirs, HuffmanTree a))
|
||||
-- decodeCompressed = Bi.second (fmap (Bi.first decodeTreeDirs)) . C.decode
|
||||
|
||||
counts :: (Ord a) => [a] -> [(a, Int)]
|
||||
counts = M.toList . F.foldl' combiningInsert M.empty
|
||||
where
|
||||
combiningInsert m key = M.insertWith (+) key 1 m
|
||||
|
||||
-- I've replaced this with a simpler version but it might be slower
|
||||
countOccurances :: (Ord a) => [a] -> [(a, Int)]
|
||||
countOccurances = M.toList . M.fromListWith (+) . map (, 1)
|
||||
-- countOccurances = M.toList . F.foldl' combiningInsert M.empty
|
||||
-- where
|
||||
-- combiningInsert m key = M.insertWith (+) key 1 m
|
||||
|
||||
divideByteString :: Int -> BS.ByteString -> [BS.ByteString]
|
||||
divideByteString n [] = []
|
||||
|
|
@ -113,15 +138,17 @@ divideByteString n bs = x : divideByteString n xs
|
|||
|
||||
|
||||
compressionRatioFor
|
||||
:: forall a
|
||||
. (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialize a)
|
||||
:: forall a b
|
||||
. (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialise a, Bounded a,
|
||||
Integral b, Ord b, C.Serialise b, Bounded b)
|
||||
=> D.Proxy a
|
||||
-> D.Proxy b
|
||||
-> BS.ByteString
|
||||
-> Double
|
||||
compressionRatioFor proxy bs =
|
||||
compressionRatioFor _ _ bs =
|
||||
(/ (fromIntegral . BS.length $ bs))
|
||||
. fromIntegral
|
||||
. BS.length
|
||||
. C.encode
|
||||
. (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a))
|
||||
. C.serialise
|
||||
. (compressWithBurrowsWheeler :: ByteString -> Maybe (BurrowsWheelerCompressed a b))
|
||||
$ bs
|
||||
|
|
|
|||
97
src/Compress/LengthDistancePairs.hs
Normal file
97
src/Compress/LengthDistancePairs.hs
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
module Compress.LengthDistancePairs where
|
||||
import qualified Data.ByteString as BS
|
||||
import Data.Word
|
||||
import qualified Data.Foldable as F
|
||||
import qualified Relude.Unsafe as US
|
||||
import qualified Debug.Trace as D
|
||||
import Compress.Arithmetic (maxBound')
|
||||
import qualified Data.Proxy as P
|
||||
import qualified Codec.Winery as C
|
||||
import qualified Relude.Unsafe as U
|
||||
|
||||
data LengthDistancePair a = Literal Word8 | Repeat {
|
||||
distance :: a,
|
||||
matchLength :: Word8
|
||||
}
|
||||
deriving (Eq, Ord, Show, Generic, NFData)
|
||||
deriving C.Serialise via (C.WineryVariant (LengthDistancePair a))
|
||||
|
||||
|
||||
ldpLength :: LengthDistancePair a -> Int
|
||||
ldpLength (Literal _) = 1
|
||||
ldpLength (Repeat {..}) = fromIntegral matchLength
|
||||
|
||||
|
||||
-- | normally, we'd order `previous` in reverse, but finding
|
||||
-- | the longest-matching-subsequence starting at `next`
|
||||
-- | doesn't let us.
|
||||
-- | The linked list dosen't really lend itself to the problem
|
||||
data Cursor = Cursor {
|
||||
previous :: [Word8],
|
||||
next :: [Word8]
|
||||
} deriving (Eq, Ord, Show, Generic)
|
||||
|
||||
toCursor :: BS.ByteString -> Cursor
|
||||
toCursor bs= Cursor {
|
||||
previous = [],
|
||||
next = BS.unpack bs
|
||||
}
|
||||
|
||||
word8Max :: Int
|
||||
word8Max = fromIntegral (maxBound :: Word8)
|
||||
|
||||
cutTo :: Int -> [a] -> [a]
|
||||
cutTo newLength xs = drop (max 0 . flip (-) newLength . length $ xs) xs
|
||||
|
||||
decode :: forall a . Integral a => [LengthDistancePair a] -> [Word8]
|
||||
decode = foldl' decodeIndividual []
|
||||
where
|
||||
decodeIndividual xs (Literal x) = xs ++ [x]
|
||||
decodeIndividual xs (Repeat {..})
|
||||
= (++) xs
|
||||
. take (fromIntegral matchLength)
|
||||
. drop (F.length xs - fromIntegral distance)
|
||||
$ xs
|
||||
|
||||
encode :: forall a . (Integral a, Bounded a) => [Word8] -> [LengthDistancePair a]
|
||||
encode xs = encode' xs xs 0
|
||||
where
|
||||
|
||||
encode' :: [Word8] -> [Word8] -> Int -> [LengthDistancePair a]
|
||||
encode' predictFrom [] n = []
|
||||
encode' predictFrom next 0 = []
|
||||
encode' predictFrom next distance = match : encode' (drop (maxLength - newDistance) predictFrom) (drop matchLength next) newDistance
|
||||
where
|
||||
(match, matchLength)
|
||||
= F.maximumBy (comparing snd)
|
||||
. map (\x -> (x, ldpLength x))
|
||||
. ( Literal (U.head next) : )
|
||||
. zipWith Repeat (reverse [1 .. (fromIntegral distance)])
|
||||
. map lengthMatchWithRepeat . tails
|
||||
. take distance
|
||||
$ predictFrom
|
||||
|
||||
|
||||
maxLength = fromIntegral (minBound :: a)
|
||||
|
||||
newDistance = max maxLength . (+ matchLength) $ distance
|
||||
|
||||
|
||||
lengthMatchWithRepeat cyclexs' = zipWithCycle' cyclexs' next
|
||||
where
|
||||
zipWithCycle' [] ys = zipWithCycle' cyclexs' ys
|
||||
zipWithCycle' (x:xs) (y:ys) | x == y = 1 + zipWithCycle' xs ys
|
||||
zipWithCycle' _ _ = 0
|
||||
|
||||
-- lengthMatchWithRepeat candidate
|
||||
-- = genericLength
|
||||
-- . takeWhile (uncurry (==)) . zip (cycle candidate)
|
||||
-- $ next
|
||||
|
||||
|
||||
|
||||
-- fasterEncode bs = encode' start start 0
|
||||
-- where
|
||||
-- encode' toPredict predictFrom maxDistance = (take diff previous)
|
||||
|
||||
-- start = BS.unpack bs take
|
||||
|
|
@ -17,9 +17,9 @@ import Data.PQueue qualified as PQ
|
|||
import Debug.Trace qualified as D
|
||||
import Debug.Trace qualified as T
|
||||
import GHC.Generics
|
||||
import qualified Data.Serialize as C
|
||||
import Data.FiniteBit
|
||||
import qualified Data.FiniteBit as B
|
||||
import qualified Codec.Winery as C
|
||||
|
||||
data Tree a = (Ord a) =>
|
||||
Tree
|
||||
|
|
@ -28,7 +28,8 @@ data Tree a = (Ord a) =>
|
|||
|
||||
newtype HuffmanPrefixTree a b = HuffmanPrefixTree
|
||||
{ inner :: M.Map a (HuffmanTree b)
|
||||
} deriving (Eq, Ord, Show, Generic, C.Serialize)
|
||||
} deriving (Eq, Ord, Show, Generic)
|
||||
deriving C.Serialise via C.WineryRecord (HuffmanPrefixTree a b)
|
||||
|
||||
finiteBitTupleUncons ::
|
||||
forall a b.
|
||||
|
|
@ -85,7 +86,7 @@ compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> in
|
|||
tree = toHuffmanTree . nGramCounts $ bs
|
||||
|
||||
treeDirMap :: M.Map a (M.Map b [TreeDir])
|
||||
treeDirMap = M.map HT.findTreeDirections . Compress.PrefixTree.inner $ tree
|
||||
treeDirMap = M.map HT.findTreeDirections tree.inner
|
||||
|
||||
initial :: Maybe a
|
||||
initial = fst <$> finiteBitUncons bs
|
||||
|
|
@ -96,37 +97,6 @@ compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> in
|
|||
treeDirsFor :: (a, b) -> [TreeDir]
|
||||
treeDirsFor (a, b) = (treeDirMap M.! a) M.! b
|
||||
|
||||
|
||||
|
||||
-- | all (M.null . children) . M.elems . children $ tree =
|
||||
-- fmap End
|
||||
-- . HT.fromList
|
||||
-- . map (\x -> (prefixCounts x, x))
|
||||
-- . M.keys
|
||||
-- . children
|
||||
-- $ tree
|
||||
-- | otherwise =
|
||||
-- Just
|
||||
-- . Layer
|
||||
-- . M.mapMaybeWithKey (\key val -> toHuffmanTree' (key : soFar) val)
|
||||
-- . children
|
||||
-- $ tree
|
||||
-- where
|
||||
-- prefixCounts :: a -> Int
|
||||
-- prefixCounts x =
|
||||
-- fromIntegral
|
||||
-- . sum
|
||||
-- . M.elems
|
||||
-- . M.filterWithKey (\key val -> L.isPrefixOf (reverse . (x :) $ soFar) key)
|
||||
-- $ nGrams
|
||||
|
||||
-- toHuffmanTree :: Tree a -> p1 -> HuffmanTree a
|
||||
-- toHuffmanTree :: forall a . Tree a -> M.Map [a] Word -> HuffmanTree [a]
|
||||
-- toHuffmanTree (Tree {..}) nGrams soFar | M.size children == 1 = Leaf . map (reverse . (: soFar)) . M.keys $ children
|
||||
-- toHuffmanTree (Tree {..}) nGrams soFar = Leaf . map (reverse . (: soFar)) . M.keys $ children
|
||||
-- where
|
||||
-- sorted = L.sortBy (prefixCounts . fst) . M.toList $ children
|
||||
|
||||
nGramCounts ::
|
||||
forall a b.
|
||||
(Integral a, B.FiniteBits a, B.Bits a, Integral b, B.FiniteBits b, B.Bits b) =>
|
||||
|
|
@ -155,34 +125,5 @@ fromSingleList (x : xs) = Tree . M.singleton x . fromSingleList $ xs
|
|||
fromList :: (Ord a) => [[a]] -> Tree a
|
||||
fromList = F.foldl' merge Compress.PrefixTree.empty . map fromSingleList
|
||||
|
||||
-- insert :: Ord a => Tree a -> [a] -> Tree a
|
||||
-- insert (Tree {..}) (x:xs) =
|
||||
|
||||
merge :: Tree a -> Tree a -> Tree a
|
||||
merge (Tree children0) (Tree children1) = Tree $ M.unionWith merge children0 children1
|
||||
|
||||
-- deriving instance Eq (Tree a)
|
||||
|
||||
-- deriving instance Ord (Tree a)
|
||||
|
||||
-- deriving instance (Show a) => Show (Tree a)
|
||||
|
||||
-- empty :: (Ord a) => Tree a
|
||||
-- empty = Tree M.empty
|
||||
|
||||
-- fromList :: (Ord a, F.Foldable t) => t [a] -> Tree a
|
||||
-- fromList = F.foldl' insert empty
|
||||
|
||||
-- insert :: Tree a -> [a] -> Tree a
|
||||
-- insert (Tree {..}) [] = Tree M.empty
|
||||
-- insert (Tree {..}) (x : xs) =
|
||||
-- Tree
|
||||
-- . flip (M.insert x) children
|
||||
-- . flip insert xs
|
||||
-- . My.fromMaybe empty
|
||||
-- . M.lookup x
|
||||
-- $ children
|
||||
|
||||
-- lookup :: (Ord a) => [a] -> Tree a -> Bool
|
||||
-- lookup [] = const True
|
||||
-- lookup (x : xs) = maybe False (Compress.PrefixTree.lookup xs) . M.lookup x . children
|
||||
|
|
|
|||
8
src/Compress/WordMarkovStats.hs
Normal file
8
src/Compress/WordMarkovStats.hs
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
module Compress.WordMarkovStats where
|
||||
|
||||
|
||||
data WordMarkovStats = WordMarkovStats
|
||||
{ location :: Ratio Integer,
|
||||
size :: Ratio Integer
|
||||
}
|
||||
deriving (Show, Eq, Ord, Generic)
|
||||
62
src/Data/ArbitraryPrecisionFloatingPoint.hs
Normal file
62
src/Data/ArbitraryPrecisionFloatingPoint.hs
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
module Data.ArbitraryPrecisionFloatingPoint where
|
||||
|
||||
import Compress.Arithmetic (binarySearch)
|
||||
import Data.Function.Pointless
|
||||
|
||||
data FloatingPoint = FloatingPoint
|
||||
{ exponent :: Integer,
|
||||
significand :: Integer
|
||||
}
|
||||
deriving (Eq)
|
||||
|
||||
instance Num FloatingPoint where
|
||||
(+) :: FloatingPoint -> FloatingPoint -> FloatingPoint
|
||||
(+) = onEqualExponent (+)
|
||||
|
||||
(*) :: FloatingPoint -> FloatingPoint -> FloatingPoint
|
||||
(*) n0 n1 = correctExponent $ FloatingPoint (n0.exponent + n1.exponent) (n0.significand * n1.significand)
|
||||
|
||||
abs :: FloatingPoint -> FloatingPoint
|
||||
abs (FloatingPoint {..}) = FloatingPoint exponent (abs significand)
|
||||
|
||||
negate :: FloatingPoint -> FloatingPoint
|
||||
negate (FloatingPoint {..}) = FloatingPoint exponent (negate significand)
|
||||
|
||||
fromInteger :: Integer -> FloatingPoint
|
||||
fromInteger = correctExponent . FloatingPoint 0
|
||||
|
||||
signum :: FloatingPoint -> FloatingPoint
|
||||
signum = FloatingPoint 0 . signum . (.significand)
|
||||
|
||||
onEqualExponent :: (Integer -> Integer -> Integer) -> FloatingPoint -> FloatingPoint -> FloatingPoint
|
||||
onEqualExponent f n0 n1 = correctExponent . FloatingPoint n0'.exponent $ on f (.significand) n0' n1'
|
||||
where
|
||||
(n0', n1') = equalizeExponents n0 n1
|
||||
|
||||
-- | significand shouldn't have trailing 0s in binary
|
||||
-- | exponent should be one larger instead
|
||||
-- | this function corrects until siginicand isn't divisible by 2
|
||||
correctExponent :: FloatingPoint -> FloatingPoint
|
||||
correctExponent (FloatingPoint {..})
|
||||
| significand /= 0 && remainder == 0 = correctExponent $ FloatingPoint (succ exponent) quotient
|
||||
| otherwise = (FloatingPoint {..})
|
||||
where
|
||||
(quotient, remainder) = divMod significand 2
|
||||
|
||||
equalizeExponents :: FloatingPoint -> FloatingPoint -> (FloatingPoint, FloatingPoint)
|
||||
equalizeExponents n0 n1 =
|
||||
( withDiff n0 exponent0Diff,
|
||||
withDiff n1 exponent1Diff
|
||||
)
|
||||
where
|
||||
withDiff n exponentDiff = FloatingPoint (n.exponent + exponentDiff) (n.significand * (2 ^ exponentDiff))
|
||||
|
||||
signedExponentDiff = n0.exponent - n1.exponent
|
||||
|
||||
(exponent0Diff, exponent1Diff)
|
||||
| signedExponentDiff < 0 = (0, abs signedExponentDiff)
|
||||
| otherwise = (signedExponentDiff, 0)
|
||||
|
||||
instance Ord FloatingPoint where
|
||||
compare :: FloatingPoint -> FloatingPoint -> Ordering
|
||||
compare = uncurry (on compare (.significand)) .: equalizeExponents
|
||||
52
src/Data/Dirs.hs
Normal file
52
src/Data/Dirs.hs
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
-- |
|
||||
|
||||
module Data.Dirs where
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.Vector.Unboxed as VU
|
||||
import qualified Data.Vector as V
|
||||
import qualified Data.Bit as VB
|
||||
import qualified Data.Bit as BV
|
||||
import Data.Ratio
|
||||
import Control.Monad.Par qualified as P
|
||||
import Codec.Winery qualified as C
|
||||
import qualified Relude as R
|
||||
|
||||
data Dir = Higher | Lower deriving Show
|
||||
|
||||
data SerializedDirs = SerializedDirs {
|
||||
bs :: BS.ByteString,
|
||||
length :: Int
|
||||
} deriving (Generic, Show, Ord, Eq, P.NFData, C.Serialise)
|
||||
|
||||
-- toBS :: [Dir] -> SerializedDirs
|
||||
serialize dirs = SerializedDirs {..}
|
||||
where
|
||||
bs = VB.cloneToByteString . VU.fromList . map dirToBit $ dirs
|
||||
length = R.length dirs
|
||||
|
||||
deserialize :: SerializedDirs -> Rational
|
||||
deserialize (SerializedDirs {..}) = fromDirs . map bitToDir . VU.toList $ asVec
|
||||
where
|
||||
asVec = VU.take length . VB.cloneFromByteString $ bs
|
||||
|
||||
addNewBit x Higher = (x * 2) + 1
|
||||
addNewBit x Lower = x * 2
|
||||
|
||||
fromDirs = sum . (0.5 :) . zipWith (*) fractions . map dirToNum
|
||||
where
|
||||
|
||||
addNewBit x Higher = (x * 2) + 1
|
||||
addNewBit x Lower = x * 2
|
||||
|
||||
|
||||
dirToNum Higher = 1
|
||||
dirToNum Lower = -1
|
||||
|
||||
bitToDir (BV.Bit True ) = Higher
|
||||
bitToDir (BV.Bit False) = Lower
|
||||
|
||||
dirToBit Higher = BV.Bit True
|
||||
dirToBit Lower = BV.Bit False
|
||||
|
||||
fractions :: [Rational]
|
||||
fractions = iterate ( / 2) (1 % 4)
|
||||
|
|
@ -9,6 +9,9 @@ import Data.Vector.Unboxed qualified as V
|
|||
import Data.Bit qualified as B
|
||||
import Data.Bit qualified as BV
|
||||
import Data.ByteString qualified as BS
|
||||
import qualified Codec.Winery as C
|
||||
import Data.FiniteBit (toByteString)
|
||||
import Codec.Winery (serialise)
|
||||
|
||||
data HuffmanTree a
|
||||
= Leaf a
|
||||
|
|
@ -16,24 +19,29 @@ data HuffmanTree a
|
|||
{ left :: HuffmanTree a,
|
||||
right :: HuffmanTree a
|
||||
}
|
||||
deriving (Eq, Ord, Show, Generic, C.Serialize, Functor)
|
||||
deriving (Eq, Ord, Show, Generic, Functor)
|
||||
deriving C.Serialise via (C.WineryVariant (HuffmanTree a))
|
||||
|
||||
|
||||
-- here so we can define our own Serialize instance
|
||||
newtype TreeDirs = TreeDirs {
|
||||
inner :: [TreeDir]
|
||||
} deriving (Eq, Ord, Show)
|
||||
-- deriving C.Serialize via C.WineryVariant TreeDirs
|
||||
|
||||
data TreeDir = L | R deriving (Eq, Ord, Show)
|
||||
|
||||
instance C.Serialize TreeDirs where
|
||||
put :: C.Putter TreeDirs
|
||||
put = C.put . Bi.first BV.cloneToByteString . (\x -> (x, V.length x)) . V.fromList . map (BV.Bit . (== R)) . (inner :: TreeDirs -> [TreeDir])
|
||||
newtype SerializedTreeDirs = SerializedTreeDirs
|
||||
{
|
||||
inner :: BS.ByteString
|
||||
} deriving (Generic, Eq, Ord)
|
||||
deriving C.Serialise via (C.WineryRecord SerializedTreeDirs)
|
||||
|
||||
get :: C.Get TreeDirs
|
||||
get = do
|
||||
(bs, len) <- C.get
|
||||
pure . TreeDirs . map (\x -> if BV.unBit x then R else L) . V.toList . V.take len . BV.cloneFromByteString $ bs
|
||||
serializeTreeDirs (TreeDirs inner) = SerializedTreeDirs . fst . Bi.first BV.cloneToByteString . (\x -> (x, V.length x)) . V.fromList . map (BV.Bit . (== R)) $ inner
|
||||
|
||||
unserializeTreeDirs (SerializedTreeDirs bs) = TreeDirs . map (\x -> if BV.unBit x then R else L) . V.toList . V.take len . BV.cloneFromByteString $ bs
|
||||
where
|
||||
len = BS.length bs
|
||||
|
||||
lookup ::
|
||||
forall a.
|
||||
|
|
|
|||
|
|
@ -32,10 +32,10 @@ maxView :: PQueue a -> Maybe (Int, (a, PQueue a))
|
|||
maxView = abstractView IM.lookupMax
|
||||
|
||||
insert :: Int -> p -> PQueue p -> PQueue p
|
||||
insert key a = PQueue . IM.insertWith NE.append key (NE.singleton a) . toMap
|
||||
insert key a = PQueue . IM.insertWith NE.append key (NE.singleton a) . (.toMap)
|
||||
|
||||
elems :: PQueue a -> [a]
|
||||
elems = concatMap NE.toList . IM.elems . toMap
|
||||
elems = concatMap NE.toList . IM.elems . (.toMap)
|
||||
|
||||
keys :: PQueue a -> [IM.Key]
|
||||
keys = IM.keys . toMap
|
||||
keys = IM.keys . (.toMap)
|
||||
|
|
|
|||
|
|
@ -1,46 +1,62 @@
|
|||
-- | note to myself. Something's wrong here.
|
||||
-- | it doesn't work properly when fed into Compress.Arithmetic
|
||||
|
||||
module Data.Word4 where
|
||||
|
||||
import Data.Bits
|
||||
import Data.Word
|
||||
import qualified Data.Serialize as C
|
||||
import qualified Codec.Winery as C
|
||||
import qualified Data.Map as M
|
||||
import qualified Text.Show as S
|
||||
import qualified Relude.Unsafe as U
|
||||
import Codec.Winery (WineryRecord)
|
||||
|
||||
-- only use lower 4 bits
|
||||
newtype Word4 = Word4 {
|
||||
inner :: Word8
|
||||
} deriving (Eq, Ord, Show, Enum, Generic, C.Serialize)
|
||||
} deriving (Eq, Ord, Enum, Generic)
|
||||
deriving C.Serialise via C.WineryRecord Word4
|
||||
|
||||
instance S.Show Word4 where
|
||||
show = show . (.inner)
|
||||
|
||||
|
||||
map2 :: (Word8 -> Word8 -> Word8) -> Word4 -> Word4 -> Word4
|
||||
map2 f (Word4 a) (Word4 b) = Word4 $ f a b
|
||||
|
||||
data CompressWord4List = CompressWord4List
|
||||
{ xs :: [Word8],
|
||||
last :: Maybe Word8 -- in lower 4 bits
|
||||
} deriving (Eq, Ord, Generic, Show, C.Serialize, NFData)
|
||||
newtype CompressWord4List = CompressWord4List
|
||||
{ xs :: [Word8]
|
||||
} deriving (Eq, Ord, Show, Generic)
|
||||
deriving C.Serialise via C.WineryVariant CompressWord4List
|
||||
|
||||
bitwiseAnd a b = getAnd $ And a <> And b
|
||||
|
||||
bitwiseOr a b = getIor $ Ior a <> Ior b
|
||||
|
||||
toCompressedWord4List [] = CompressWord4List [] Nothing
|
||||
toCompressedWord4List [Word4 x] = CompressWord4List [] (Just x)
|
||||
toCompressedWord4List ((Word4 x) : (Word4 x') : xs) =
|
||||
CompressWord4List
|
||||
{ xs = headByte : xs',
|
||||
last = last'
|
||||
}
|
||||
where
|
||||
headByte = bitwiseOr (bitwiseAnd 0xf0 (x .<<. 4)) (bitwiseAnd 0x0f x')
|
||||
|
||||
(CompressWord4List xs' last') = toCompressedWord4List xs
|
||||
toCompressedWord4List = toCompressedWord4List' . U.init
|
||||
where
|
||||
toCompressedWord4List' [] = CompressWord4List []
|
||||
toCompressedWord4List' [Word4 x] = CompressWord4List [x .<<. 4]
|
||||
toCompressedWord4List' ((Word4 x) : (Word4 x') : xs) =
|
||||
CompressWord4List
|
||||
{ xs = headByte : xs'
|
||||
}
|
||||
where
|
||||
headByte = bitwiseOr (bitwiseAnd 0xf0 (x .<<. 4)) (bitwiseAnd 0x0f x')
|
||||
|
||||
(CompressWord4List xs') = toCompressedWord4List' xs
|
||||
|
||||
|
||||
fromCompressedWord4List :: CompressWord4List -> [Word4]
|
||||
fromCompressedWord4List (CompressWord4List [] Nothing) = []
|
||||
fromCompressedWord4List (CompressWord4List [] (Just a)) = [word4 a]
|
||||
fromCompressedWord4List (CompressWord4List {xs = (x : xs), ..}) =
|
||||
Word4 ((bitwiseAnd 0xf0 x) .>>. 4)
|
||||
: word4 x
|
||||
: fromCompressedWord4List (CompressWord4List xs last)
|
||||
fromCompressedWord4List = fromCompressedWord4List' . (.xs)
|
||||
where
|
||||
fromCompressedWord4List' :: [Word8] -> [Word4]
|
||||
fromCompressedWord4List' [] = []
|
||||
fromCompressedWord4List' (x : xs) =
|
||||
word4 (x .>>. 4)
|
||||
: word4 x
|
||||
: fromCompressedWord4List' xs
|
||||
|
||||
-- instance Show Word4 where
|
||||
-- show :: Word4 -> String
|
||||
|
|
@ -48,10 +64,10 @@ fromCompressedWord4List (CompressWord4List {xs = (x : xs), ..}) =
|
|||
|
||||
instance Bounded Word4 where
|
||||
minBound = word4 0
|
||||
maxBound = word4 0xf
|
||||
maxBound = word4 15
|
||||
|
||||
instance Real Word4 where
|
||||
toRational = toRational . inner
|
||||
toRational = toRational . (.inner)
|
||||
|
||||
instance Integral Word4 where
|
||||
quot = map2 quot `on` clean
|
||||
|
|
@ -66,9 +82,9 @@ instance Integral Word4 where
|
|||
divMod (Word4 a) (Word4 b) = (word4 a', word4 b')
|
||||
where
|
||||
(a', b') = divMod a b
|
||||
toInteger = toInteger . inner . clean
|
||||
toInteger = toInteger . (.inner) . clean
|
||||
|
||||
clean (Word4 a) = Word4 . bitwiseAnd 0xf $ a
|
||||
clean (Word4 a) = Word4 . bitwiseAnd 0x0f $ a
|
||||
word4 = clean . Word4
|
||||
|
||||
instance Num Word4 where
|
||||
|
|
|
|||
71
src/Main.hs
71
src/Main.hs
|
|
@ -1,71 +0,0 @@
|
|||
module Main where
|
||||
|
||||
import Compress.Huffman
|
||||
import Compress.Huffman qualified as HT
|
||||
import Compress.PrefixTree (HuffmanPrefixTree (HuffmanPrefixTree))
|
||||
import Compress.PrefixTree qualified as PT
|
||||
import Compress.Arithmetic qualified as A
|
||||
import Data.Bifunctor qualified as Bi
|
||||
import Data.Bit qualified as B
|
||||
import Data.Bits (Bits ((.|.)))
|
||||
import Data.Bits qualified as B
|
||||
import Data.ByteString qualified as BS
|
||||
import Data.HuffmanTree
|
||||
import Data.Proxy qualified as P
|
||||
import Data.Serialize qualified as C
|
||||
import Data.Word
|
||||
import GHC.Generics (Generic)
|
||||
import Options.Generic qualified as O
|
||||
import qualified Data.FiniteBit as FB
|
||||
|
||||
data CompressOrDecompress = Compress | Decompress deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
|
||||
|
||||
data CompressionStrategy = Huffman | MarkovHuffman deriving (Show, Generic, O.ParseField, O.ParseFields, O.ParseRecord, Read)
|
||||
|
||||
data CLIOpts = CLIOpts
|
||||
{ task :: CompressOrDecompress,
|
||||
strategy :: CompressionStrategy
|
||||
}
|
||||
deriving (Show, Generic, O.ParseRecord)
|
||||
|
||||
applyCompressionOptions ::
|
||||
forall a.
|
||||
(Integral a, B.Bits a, B.FiniteBits a, Ord a, C.Serialize a) =>
|
||||
P.Proxy a ->
|
||||
CLIOpts ->
|
||||
BS.ByteString ->
|
||||
BS.ByteString
|
||||
applyCompressionOptions _ (CLIOpts Compress Huffman) f =
|
||||
C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree a)) $ f
|
||||
applyCompressionOptions _ (CLIOpts Compress MarkovHuffman) f =
|
||||
C.encode . (PT.compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a a, a)) $ f
|
||||
applyCompressionOptions _ (CLIOpts Decompress Huffman) f =
|
||||
handleError $ Bi.second decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanTree a)) $ f
|
||||
applyCompressionOptions _ (CLIOpts Decompress MarkovHuffman) f =
|
||||
handleError $ Bi.second PT.decompress . (C.decode :: BS.ByteString -> Either String (TreeDirs, HuffmanPrefixTree a a, a)) $ f
|
||||
|
||||
handleError (Right (Just bs)) = bs
|
||||
handleError _ = []
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
-- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
|
||||
f <- BS.getContents
|
||||
let fAsWords = (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
|
||||
let compressedUnencoded :: (A.Compressed Word8 (A.Word4MapSerialized Word8)) = (A.compress fAsWords)
|
||||
let compressed = C.encode compressedUnencoded
|
||||
print ("compression ratio", (fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
|
||||
print ("works?", (A.decompress compressedUnencoded) == fAsWords)
|
||||
|
||||
-- let f = "hello tehre"
|
||||
-- f <- BS.readFile "pg64317.txt"
|
||||
-- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f
|
||||
-- print $ BS.length . C.encode $ compressed
|
||||
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word8)) $ f
|
||||
-- print $ BS.length . C.encode . (compress :: BS.ByteString -> Maybe (TreeDirs, HuffmanTree Word16)) $ f
|
||||
-- BS.writeFile "outin.txt" decompressed
|
||||
-- print (decompressed, f)
|
||||
-- print $ BS.length decompressed
|
||||
-- print $ BS.length f
|
||||
-- print (decompressed == f)
|
||||
-- BS.putStr . applyCompressionOptions (P.Proxy :: P.Proxy Word16) cliOpts $ f
|
||||
92
test/Test.hs
Normal file
92
test/Test.hs
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
import Compress.Arithmetic (twoByteMarkov)
|
||||
import Compress.Arithmetic qualified as A
|
||||
import Compress.LengthDistancePairs as LDP
|
||||
import Compress.WordMarkovStats as WM
|
||||
import Data.Bifunctor qualified as Bi
|
||||
import Data.ByteString qualified as BS
|
||||
import Data.Dirs as D
|
||||
import Data.Map.Strict qualified as M
|
||||
import Data.Ratio
|
||||
import Test.Falsify.Generator qualified as Gen
|
||||
import Test.Falsify.Predicate as P
|
||||
import Test.Falsify.Property qualified as P
|
||||
import Test.Falsify.Range qualified as Range
|
||||
import Test.Tasty
|
||||
import Test.Tasty.Falsify as F
|
||||
import Test.Tasty.HUnit
|
||||
|
||||
-- import qualified GHC.Enum as B
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
toCompress <- -- BS.take 12000 <$>
|
||||
BS.readFile "pg64317.txt"
|
||||
defaultMain (tests toCompress)
|
||||
|
||||
tests toCompress =
|
||||
testGroup
|
||||
"falsify"
|
||||
[ testGroup
|
||||
"Arithmetic"
|
||||
[ testCase "works" $
|
||||
A.twoByteMarkov ([0, 1, 0, 1, 0, 1, 0, 1, 0, 9] :: [Word8])
|
||||
@?= M.fromList [(0, [(1, 0.8), (9, 0.2)]), (1, [(0, 1)])],
|
||||
testCase "relativeCounts works as expected with one param" $
|
||||
A.relativeCounts [(0, 30)]
|
||||
@?= [(0, 1)],
|
||||
testCase "relativeCounts works as expected with lots of params" $
|
||||
A.relativeCounts [(0, 30), (1, 20)]
|
||||
@?= [(0, 30 % 50), (1, 20 % 50)],
|
||||
testCase "toRing locs all less than 1" $
|
||||
assertBool "larger than one" $
|
||||
all (all ((<= 1) . (.location))) toCompressRing,
|
||||
testCase "toRing sizes all add up to 1" $
|
||||
assertBool "larger than 1" $
|
||||
all ((== 1) . sum . map (.size)) toCompressRing,
|
||||
testCase "toRing gives no zero sizes" $
|
||||
assertBool "== 0" $
|
||||
all (all ((/= 0) . (.size))) toCompressRing,
|
||||
F.testProperty "binary search" propBinarySearchWithinBounds,
|
||||
F.testProperty "compress and decompress isomorphism" (propCompressDecompressIsomorphism (A.decompress . A.compress))
|
||||
]
|
||||
-- testGroup
|
||||
-- "LengthDistancePair"
|
||||
-- [ F.testProperty "compress and decompress isomorphism" $ (propCompressDecompressIsomorphism (LDP.decode . (LDP.encode :: [Word8] -> [LDP.LengthDistancePair Word32])))
|
||||
-- ]
|
||||
]
|
||||
where
|
||||
toCompressRing =
|
||||
map (M.elems . A.toRing . map (Bi.second (A.discretizeFraction :: Rational -> Word8)))
|
||||
. M.elems
|
||||
. A.twoByteMarkov
|
||||
. BS.unpack
|
||||
$ toCompress
|
||||
|
||||
wordMaxBound :: Integer
|
||||
wordMaxBound = fromIntegral (maxBound :: Word)
|
||||
|
||||
genProperFraction = F.gen . fmap ((% wordMaxBound) . fromIntegral) . Gen.inRange $ (Range.between (0, maxBound :: Word))
|
||||
|
||||
-- propBinarySearchWithinBounds :: Property' String ()
|
||||
propBinarySearchWithinBounds = do
|
||||
bound1 <- genProperFraction
|
||||
epsilon' <- genProperFraction
|
||||
let epsilon = max (1 % (fromIntegral (maxBound :: Word))) . (* 0.1) $ bound1 * epsilon'
|
||||
let bound2 = if bound1 + epsilon < 1 then bound1 + epsilon else bound1 - epsilon
|
||||
-- let bound2 = min 1 $ bound1 + epsilon
|
||||
let iso = D.deserialize . D.serialize $ A.binarySearch bound1 bound2
|
||||
P.assert $ P.le .$ ("minimum", min bound1 bound2) .$ ("iso", iso)
|
||||
P.assert $ P.ge .$ ("maximum", max bound1 bound2) .$ ("iso", iso)
|
||||
|
||||
-- P.assert $ Range.between (min bound1 bound2) (max bound1 bound2) .$ (A.fromDirs $ A.binarySearch bound1 bound2)
|
||||
|
||||
-- propCompressDecompressIsomorphism :: F.Property ()
|
||||
-- propCompressDecompressIsomorphism :: Property' String ()
|
||||
propCompressDecompressIsomorphism :: ([Word8] -> [Word8]) -> Property' String ()
|
||||
propCompressDecompressIsomorphism iso = do
|
||||
(xs :: [Word8]) <-
|
||||
F.gen $ Gen.list (Range.between (4, 5000)) intGen
|
||||
P.assert $ eq .$ ("xs", xs) .$ ("changed", iso $ xs)
|
||||
where
|
||||
intGen :: F.Gen Word8
|
||||
intGen = Gen.inRange $ Range.between (0, fromIntegral (maxBound :: Word8))
|
||||
Loading…
Add table
Add a link
Reference in a new issue