swap to word4 instead of word8 sizes
This isn't better yet, but I have a hunch it's due to the Maybe in the compressed word4 list type, which I can get rid of
This commit is contained in:
parent
46749895bc
commit
2123636291
9 changed files with 400 additions and 148 deletions
|
|
@ -1,4 +1,4 @@
|
|||
cabal-version: 3.0
|
||||
cabal-version: 3.4
|
||||
name: compress
|
||||
version: 0.1.0.0
|
||||
category: Web
|
||||
|
|
@ -41,21 +41,28 @@ executable compress
|
|||
UndecidableInstances,
|
||||
hs-source-dirs:
|
||||
src
|
||||
mixins:
|
||||
base hiding (Prelude),
|
||||
relude (Relude as Prelude),
|
||||
relude
|
||||
build-depends:
|
||||
base,
|
||||
basement,
|
||||
bitvec,
|
||||
bytestring,
|
||||
cereal,
|
||||
containers,
|
||||
-- accelerate,
|
||||
-- containers-accelerate,
|
||||
leancheck,
|
||||
monad-par,
|
||||
monad-par-extras,
|
||||
nonempty-containers,
|
||||
optparse-generic,
|
||||
relude,
|
||||
text,
|
||||
uuid,
|
||||
optparse-generic,
|
||||
vector,
|
||||
nonempty-containers,
|
||||
witch,
|
||||
monad-par,
|
||||
monad-par-extras
|
||||
witch
|
||||
default-language:
|
||||
GHC2021
|
||||
other-modules:
|
||||
|
|
@ -65,7 +72,7 @@ executable compress
|
|||
Compress.PrefixTree
|
||||
Data.HuffmanTree
|
||||
Compress.Arithmetic
|
||||
Data.WordyMap
|
||||
Data.Word4
|
||||
ghc-options:
|
||||
-threaded
|
||||
-fprof-auto
|
||||
|
|
|
|||
105
flake.lock
generated
105
flake.lock
generated
|
|
@ -1,5 +1,21 @@
|
|||
{
|
||||
"nodes": {
|
||||
"accelerate": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1732969010,
|
||||
"narHash": "sha256-Qrmtrgij2GbklBXUK42Pt6Db8WiGijA5sz5oC5AR72c=",
|
||||
"owner": "AccelerateHS",
|
||||
"repo": "accelerate",
|
||||
"rev": "02da6161ef143a9886c8bce542cd96029c4f527a",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "AccelerateHS",
|
||||
"repo": "accelerate",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-parts": {
|
||||
"inputs": {
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
|
|
@ -18,13 +34,28 @@
|
|||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-root": {
|
||||
"locked": {
|
||||
"lastModified": 1723604017,
|
||||
"narHash": "sha256-rBtQ8gg+Dn4Sx/s+pvjdq3CB2wQNzx9XGFq/JVGCB6k=",
|
||||
"owner": "srid",
|
||||
"repo": "flake-root",
|
||||
"rev": "b759a56851e10cb13f6b8e5698af7b59c44be26e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "srid",
|
||||
"repo": "flake-root",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"haskell-flake": {
|
||||
"locked": {
|
||||
"lastModified": 1734464164,
|
||||
"narHash": "sha256-5JCCyrgy7IMnipyYMQzIAXncGt2XVlW1aK71A+FTXDs=",
|
||||
"lastModified": 1734984991,
|
||||
"narHash": "sha256-oUYtRBD3Yhw2jvKYo0lfd82fgEQQbFoiJcHO923gmOc=",
|
||||
"owner": "srid",
|
||||
"repo": "haskell-flake",
|
||||
"rev": "e280b39efdd72b6a5bdaa982b67f150c819be642",
|
||||
"rev": "daf00052906bdd977e57a07f7048437214232e87",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
|
@ -33,13 +64,64 @@
|
|||
"type": "github"
|
||||
}
|
||||
},
|
||||
"llvm-hs": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1665495332,
|
||||
"narHash": "sha256-JKrpUmHJ1nsNiCoHhV5FCcdQGlNFfD37Oiu5kSmghfM=",
|
||||
"owner": "llvm-hs",
|
||||
"repo": "llvm-hs",
|
||||
"rev": "423220bffac4990d019fc088c46c5f25310d5a33",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "llvm-hs",
|
||||
"ref": "llvm-12",
|
||||
"repo": "llvm-hs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"llvm-hs-pure": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"dir": "llvm-hs-pure",
|
||||
"lastModified": 1665495332,
|
||||
"narHash": "sha256-JKrpUmHJ1nsNiCoHhV5FCcdQGlNFfD37Oiu5kSmghfM=",
|
||||
"owner": "llvm-hs",
|
||||
"repo": "llvm-hs",
|
||||
"rev": "423220bffac4990d019fc088c46c5f25310d5a33",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"dir": "llvm-hs-pure",
|
||||
"owner": "llvm-hs",
|
||||
"ref": "llvm-12",
|
||||
"repo": "llvm-hs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"mission-control": {
|
||||
"locked": {
|
||||
"lastModified": 1733438716,
|
||||
"narHash": "sha256-1tt43rwHk0N5fwEhbpsHWO4nBVFCQN0w1KM427DNycM=",
|
||||
"owner": "Platonic-Systems",
|
||||
"repo": "mission-control",
|
||||
"rev": "65d04c4ab9db076eff09824d2936a5c215c21f36",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "Platonic-Systems",
|
||||
"repo": "mission-control",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1734424634,
|
||||
"narHash": "sha256-cHar1vqHOOyC7f1+tVycPoWTfKIaqkoe1Q6TnKzuti4=",
|
||||
"lastModified": 1734649271,
|
||||
"narHash": "sha256-4EVBRhOjMDuGtMaofAIqzJbg4Ql7Ai0PSeuVZTHjyKQ=",
|
||||
"owner": "nixos",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "d3c42f187194c26d9f0309a8ecc469d6c878ce33",
|
||||
"rev": "d70bd19e0a38ad4790d3913bf08fcbfc9eeca507",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
|
@ -63,8 +145,13 @@
|
|||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"accelerate": "accelerate",
|
||||
"flake-parts": "flake-parts",
|
||||
"flake-root": "flake-root",
|
||||
"haskell-flake": "haskell-flake",
|
||||
"llvm-hs": "llvm-hs",
|
||||
"llvm-hs-pure": "llvm-hs-pure",
|
||||
"mission-control": "mission-control",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"systems": "systems",
|
||||
"treefmt-nix": "treefmt-nix"
|
||||
|
|
@ -92,11 +179,11 @@
|
|||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1733761991,
|
||||
"narHash": "sha256-s4DalCDepD22jtKL5Nw6f4LP5UwoMcPzPZgHWjAfqbQ=",
|
||||
"lastModified": 1734982074,
|
||||
"narHash": "sha256-N7M37KP7cHWoXicuE536GrVvU8nMDT/gpI1kja2hkdg=",
|
||||
"owner": "numtide",
|
||||
"repo": "treefmt-nix",
|
||||
"rev": "0ce9d149d99bc383d1f2d85f31f6ebd146e46085",
|
||||
"rev": "e41e948cf097cbf96ba4dff47a30ea6891af9f33",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
|
|
|||
179
flake.nix
179
flake.nix
|
|
@ -5,87 +5,142 @@
|
|||
systems.url = "github:nix-systems/default";
|
||||
flake-parts.url = "github:hercules-ci/flake-parts";
|
||||
haskell-flake.url = "github:srid/haskell-flake";
|
||||
flake-root.url = "github:srid/flake-root";
|
||||
treefmt-nix.url = "github:numtide/treefmt-nix";
|
||||
treefmt-nix.inputs.nixpkgs.follows = "nixpkgs";
|
||||
accelerate = {
|
||||
url = "github:AccelerateHS/accelerate?submodules=1";
|
||||
flake = false;
|
||||
};
|
||||
llvm-hs = {
|
||||
url = "github:llvm-hs/llvm-hs?ref=llvm-12";
|
||||
flake = false;
|
||||
};
|
||||
llvm-hs-pure = {
|
||||
url = "github:llvm-hs/llvm-hs?dir=llvm-hs-pure&ref=llvm-12";
|
||||
flake = false;
|
||||
};
|
||||
mission-control.url = "github:Platonic-Systems/mission-control";
|
||||
};
|
||||
|
||||
outputs = inputs:
|
||||
inputs.flake-parts.lib.mkFlake { inherit inputs; } {
|
||||
systems = import inputs.systems;
|
||||
imports = [
|
||||
inputs.haskell-flake.flakeModule
|
||||
inputs.treefmt-nix.flakeModule
|
||||
];
|
||||
perSystem = { self', system, lib, config, pkgs, ... }: {
|
||||
# Our only Haskell project. You can have multiple projects, but this template
|
||||
# has only one.
|
||||
# See https://github.com/srid/haskell-flake/blob/master/example/flake.nix
|
||||
haskellProjects.default = {
|
||||
# The base package set (this value is the default)
|
||||
basePackages = pkgs.haskell.packages.ghc96;
|
||||
inputs.flake-parts.lib.mkFlake { inherit inputs; } {
|
||||
systems = import inputs.systems;
|
||||
imports = [
|
||||
inputs.flake-root.flakeModule
|
||||
inputs.haskell-flake.flakeModule
|
||||
inputs.treefmt-nix.flakeModule
|
||||
inputs.mission-control.flakeModule
|
||||
];
|
||||
perSystem = { self', system, lib, config, pkgs, ... }: {
|
||||
# Our only Haskell project. You can have multiple projects, but this template
|
||||
# has only one.
|
||||
# See https://github.com/srid/haskell-flake/blob/master/example/flake.nix
|
||||
haskellProjects.default = {
|
||||
# The base package set (this value is the default)
|
||||
basePackages = pkgs.haskell.packages.ghc96;
|
||||
|
||||
# Packages to add on top of `basePackages`
|
||||
packages = {
|
||||
# Add source or Hackage overrides here
|
||||
# (Local packages are added automatically)
|
||||
# https://github.com/lehins/hip.git
|
||||
# hip.source = inputs.hip + "/hip";
|
||||
};
|
||||
# Packages to add on top of `basePackages`
|
||||
packages = {
|
||||
# Add source or Hackage overrides here
|
||||
# (Local packages are added automatically)
|
||||
# https://github.com/lehins/hip.git
|
||||
# hip.source = inputs.hip + "/hip";
|
||||
accelerate.source = inputs.accelerate;
|
||||
# llvm-hs.source = inputs.llvm-hs;
|
||||
llvm-hs-pure.source = inputs.llvm-hs-pure;
|
||||
};
|
||||
|
||||
# Add your package overrides here
|
||||
settings = {
|
||||
uuid ={
|
||||
# Add your package overrides here
|
||||
settings = {
|
||||
uuid.jailbreak = true;
|
||||
accelerate =
|
||||
{
|
||||
jailbreak = true;
|
||||
broken = false;
|
||||
};
|
||||
llvm-hs =
|
||||
{
|
||||
jailbreak = true;
|
||||
broken = false;
|
||||
};
|
||||
llvm-hs-pure =
|
||||
{
|
||||
jailbreak = true;
|
||||
broken = false;
|
||||
};
|
||||
|
||||
# hlint = {
|
||||
# jailbreak = true;
|
||||
# };
|
||||
# barbies-th = {
|
||||
};
|
||||
# hlint = {
|
||||
# jailbreak = true;
|
||||
# };
|
||||
# barbies-th = {
|
||||
# broken = false;
|
||||
# jailbreak = true;
|
||||
# };
|
||||
|
||||
# Development shell configuration
|
||||
devShell = {
|
||||
hlsCheck.enable = true;
|
||||
};
|
||||
|
||||
# What should haskell-flake add to flake outputs?
|
||||
autoWire = [ "packages" "apps" "checks" ]; # Wire all but the devShell
|
||||
};
|
||||
|
||||
# Auto formatters. This also adds a flake check to ensure that the
|
||||
# source tree was auto formatted.
|
||||
treefmt.config = {
|
||||
projectRootFile = "flake.nix";
|
||||
|
||||
programs.ormolu.enable = true;
|
||||
programs.nixpkgs-fmt.enable = true;
|
||||
programs.cabal-fmt.enable = true;
|
||||
programs.hlint.enable = false;
|
||||
|
||||
# We use fourmolu
|
||||
programs.ormolu.package = pkgs.haskellPackages.fourmolu;
|
||||
};
|
||||
|
||||
# Default package & app.
|
||||
packages.default = self'.packages.compress;
|
||||
apps.default = self'.apps.compress;
|
||||
|
||||
mission-control.scripts = {
|
||||
hoogle = {
|
||||
description = "Start Hoogle server for project dependencies";
|
||||
exec = ''
|
||||
echo http://127.0.0.1:8888;
|
||||
hoogle serve -p 8888 --local;
|
||||
'';
|
||||
category = "Dev Tools";
|
||||
};
|
||||
|
||||
# Development shell configuration
|
||||
devShell = {
|
||||
hlsCheck.enable = true;
|
||||
haddocks = {
|
||||
description = "make docs & serve them";
|
||||
exec = ''
|
||||
echo http://127.0.0.1:8887;
|
||||
cabal haddock-project --executables --internal --hoogle || true;
|
||||
python3 -m http.server -d haddocks 8887;
|
||||
'';
|
||||
category = "Dev Tools";
|
||||
};
|
||||
|
||||
# What should haskell-flake add to flake outputs?
|
||||
autoWire = [ "packages" "apps" "checks" ]; # Wire all but the devShell
|
||||
};
|
||||
|
||||
# Auto formatters. This also adds a flake check to ensure that the
|
||||
# source tree was auto formatted.
|
||||
treefmt.config = {
|
||||
projectRootFile = "flake.nix";
|
||||
|
||||
programs.ormolu.enable = true;
|
||||
programs.nixpkgs-fmt.enable = true;
|
||||
programs.cabal-fmt.enable = true;
|
||||
programs.hlint.enable = false;
|
||||
|
||||
# We use fourmolu
|
||||
programs.ormolu.package = pkgs.haskellPackages.fourmolu;
|
||||
};
|
||||
|
||||
# Default package & app.
|
||||
packages.default = self'.packages.compress;
|
||||
apps.default = self'.apps.compress;
|
||||
|
||||
# Default shell.
|
||||
devShells.default = pkgs.mkShell {
|
||||
name = "haskell-template";
|
||||
meta.description = "Haskell development environment";
|
||||
# See https://zero-to-flakes.com/haskell-flake/devshell#composing-devshells
|
||||
inputsFrom = [
|
||||
config.haskellProjects.default.outputs.devShell
|
||||
config.treefmt.build.devShell
|
||||
];
|
||||
nativeBuildInputs = with pkgs; [
|
||||
];
|
||||
};
|
||||
# Default shell.
|
||||
devShells.default = pkgs.mkShell {
|
||||
name = "haskell-template";
|
||||
meta.description = "Haskell development environment";
|
||||
# See https://zero-to-flakes.com/haskell-flake/devshell#composing-devshells
|
||||
inputsFrom = [
|
||||
config.mission-control.devShell
|
||||
config.haskellProjects.default.outputs.devShell
|
||||
config.treefmt.build.devShell
|
||||
];
|
||||
nativeBuildInputs = with pkgs; [
|
||||
];
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,13 +1,15 @@
|
|||
-- | https://en.wikipedia.org/wiki/Arithmetic_coding
|
||||
module Compress.Arithmetic where
|
||||
module Compress.Arithmetic (Compressed, compress, decompress, Word4MapSerialized) where
|
||||
|
||||
import Basement.Bits qualified as B
|
||||
import Basement.Compat.Base (Word16, Word32, Word64, Word8)
|
||||
import Basement.Compat.Bifunctor qualified as Bi
|
||||
import Data.Bits qualified as B
|
||||
import Data.Word
|
||||
import Data.Bifunctor qualified as Bi
|
||||
import Control.Arrow qualified as Ar
|
||||
import Control.Monad.Par (NFData)
|
||||
import Control.Monad.Par qualified as P
|
||||
import Data.ByteString qualified as By
|
||||
import Data.FiniteBit qualified as Fi
|
||||
import Data.Foldable as F
|
||||
import qualified Data.Foldable as F
|
||||
import Data.List (genericLength)
|
||||
import Data.List qualified as L
|
||||
import Data.Map.Strict qualified as M
|
||||
|
|
@ -17,29 +19,34 @@ import Data.Ratio
|
|||
import Data.Serialize qualified as C
|
||||
import GHC.Generics (Generic)
|
||||
import GHC.Natural (Natural)
|
||||
import qualified Control.Monad.Par as P
|
||||
import Control.Monad.Par (NFData)
|
||||
import qualified Relude.Unsafe as US
|
||||
import Data.Word4 (Word4, CompressWord4List, toCompressedWord4List, fromCompressedWord4List)
|
||||
import qualified Data.ByteString as BS
|
||||
|
||||
-- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes
|
||||
|
||||
-- shortestNumberBetween :: Ratio Integer -> Ratio Integer -> Maybe (Ratio Integer)
|
||||
-- shortestNumberBetween xMin xMax = find (\candidate -> candidate >= xMin && candidate < xMax) withPrimeDenums
|
||||
|
||||
|
||||
data WordMarkovStats = WordMarkovStats
|
||||
{ location :: Ratio Integer,
|
||||
size :: Ratio Integer
|
||||
|
||||
}
|
||||
deriving (Show, Eq, Ord, Generic, C.Serialize)
|
||||
|
||||
|
||||
loc' (WordMarkovStats {..}) = location
|
||||
|
||||
toRing :: forall a k. (Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats
|
||||
toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats
|
||||
where
|
||||
|
||||
sizes = map ((% maxBound') . fromIntegral . snd) xs
|
||||
|
||||
wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized
|
||||
|
||||
asFracsOfTotalSum = map fst . tail . L.scanl' f (0, 0) $ sizes
|
||||
asFracsOfTotalSum = map fst . US.tail . L.scanl' f (0, 0) $ sizes
|
||||
|
||||
maxBound' :: Integer
|
||||
maxBound' = fromIntegral (maxBound :: a)
|
||||
|
|
@ -62,14 +69,14 @@ twoByteMarkov xs =
|
|||
. M.fromListWith (M.unionWith (+))
|
||||
. zip xs
|
||||
. map (`M.singleton` (1 :: Integer))
|
||||
. tail
|
||||
. US.tail
|
||||
$ xs
|
||||
where
|
||||
-- optimization, changes fractions so they represent the fraction
|
||||
-- of remaining space in the list taken up instead of total space
|
||||
sizeAsFraction :: M.Map k Integer -> [(k, b)]
|
||||
sizeAsFraction m =
|
||||
zip keys
|
||||
zip keys
|
||||
. map discretizeFraction
|
||||
. fractionOfRemainingSums
|
||||
$ counts
|
||||
|
|
@ -88,34 +95,48 @@ twoByteMarkov xs =
|
|||
maxBound' :: Ratio Integer
|
||||
maxBound' = fromIntegral (maxBound :: b)
|
||||
|
||||
data Compressed a = Compressed
|
||||
{ markovs :: M.Map a [(a, Word8)],
|
||||
type Word4MapSerialized a = M.Map a ([a], CompressWord4List)
|
||||
type Word4Map a = M.Map a [(a, Word4)]
|
||||
|
||||
data Compressed a mapImplementation = Compressed
|
||||
{ markovs :: mapImplementation,
|
||||
chunks :: [Chunk a]
|
||||
}
|
||||
deriving (Eq, Ord, Show, Generic) -- , C.Serialize, P.NFData)
|
||||
|
||||
|
||||
deriving instance (Ord a, C.Serialize a) => C.Serialize (Compressed a (Word4MapSerialized a))
|
||||
|
||||
data Chunk a = Chunk
|
||||
{ location :: Ratio Integer,
|
||||
start :: a,
|
||||
length :: Int
|
||||
}
|
||||
deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
|
||||
|
||||
data Chunk a = Chunk {
|
||||
location :: Ratio Integer,
|
||||
start :: a,
|
||||
length :: Int
|
||||
} deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData)
|
||||
unserealizeWord4Map :: Word4MapSerialized a -> Word4Map a
|
||||
unserealizeWord4Map = M.map (uncurry zip . Bi.second ((++ [maxBound]) . fromCompressedWord4List))
|
||||
|
||||
serealizeWord4Map :: Word4Map a -> Word4MapSerialized a
|
||||
serealizeWord4Map = M.map (Bi.second (toCompressedWord4List . L.init) . L.unzip)
|
||||
|
||||
decompress ::
|
||||
forall a.
|
||||
(Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) =>
|
||||
Compressed a ->
|
||||
(Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
||||
Compressed a (Word4MapSerialized a) ->
|
||||
[a]
|
||||
decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ chunks
|
||||
where
|
||||
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs'
|
||||
|
||||
rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs
|
||||
markovs' = unserealizeWord4Map markovs
|
||||
|
||||
toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size))
|
||||
|
||||
decompressChunk :: Chunk a -> [a]
|
||||
decompressChunk (Chunk {..}) = map snd . take length . L.iterate' decompress' $ (location, start)
|
||||
where
|
||||
decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
|
||||
-- decompress' :: (Ratio Integer, a) -> (Ratio Integer, a)
|
||||
decompress' (!loc, !prev) = ((loc - ansLoc) / newSize, newVal)
|
||||
where
|
||||
ansLoc :: Ratio Integer
|
||||
|
|
@ -126,6 +147,7 @@ decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ ch
|
|||
sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8]
|
||||
sanityCheck = map (sum . M.elems) . M.elems
|
||||
|
||||
chunk :: Int -> [a] -> [[a]]
|
||||
chunk chunkSize = chunk'
|
||||
where
|
||||
chunk' [] = []
|
||||
|
|
@ -133,30 +155,30 @@ chunk chunkSize = chunk'
|
|||
where
|
||||
(xs', xs'') = splitAt chunkSize xs
|
||||
|
||||
chunkLength :: Int
|
||||
chunkLength = 8000
|
||||
|
||||
compress ::
|
||||
forall a.
|
||||
(Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) =>
|
||||
[a] ->
|
||||
Compressed a
|
||||
compress toCompress = Compressed twoByteMarkovs . P.runPar . P.parMap compressChunk $ unCompressedChunks
|
||||
-- compress ::
|
||||
-- forall a.
|
||||
-- (Integral a, B.FiniteBits a, B.Bits a, Show a, NFData a) =>
|
||||
-- [a] ->
|
||||
-- Compressed a (Word4MapSerialized a)
|
||||
compress toCompress = traceShow ((fromIntegral :: Int -> Double) (BS.length . C.encode $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)])) / (fromIntegral . BS.length . C.encode . serealizeWord4Map $ (twoByteMarkovs :: M.Map Word8 [(Word8, Word4)]))) $ Compressed (serealizeWord4Map twoByteMarkovs) . P.runPar . P.parMap compressChunk $ unCompressedChunks
|
||||
where
|
||||
|
||||
unCompressedChunks = chunk chunkLength toCompress
|
||||
|
||||
twoByteMarkovs = twoByteMarkov toCompress
|
||||
|
||||
rings = M.map toRing twoByteMarkovs
|
||||
|
||||
compressChunk toCompress = Chunk {
|
||||
location = shortestLocation endStats,
|
||||
start = head toCompress,
|
||||
length = L.length toCompress
|
||||
}
|
||||
compressChunk toCompress =
|
||||
Chunk
|
||||
{ location = shortestLocation endStats,
|
||||
start = US.head toCompress,
|
||||
length = L.length toCompress
|
||||
}
|
||||
where
|
||||
|
||||
pairs = zip toCompress . tail $ toCompress
|
||||
pairs = zip toCompress . US.tail $ toCompress
|
||||
|
||||
shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size)
|
||||
|
||||
|
|
@ -172,7 +194,6 @@ compress toCompress = Compressed twoByteMarkovs . P.runPar . P.parMap compressCh
|
|||
size = prevSize * nextSize
|
||||
}
|
||||
|
||||
|
||||
pyramidFold :: (a -> a -> a) -> [a] -> a
|
||||
pyramidFold f = pyramid
|
||||
where
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
module Compress.Huffman where
|
||||
|
||||
--- TODO: ?? accelarate
|
||||
|
||||
|
||||
import Data.Bifunctor qualified as Bi
|
||||
import Data.FiniteBit
|
||||
|
|
@ -12,9 +14,9 @@ import Data.Data qualified as D
|
|||
import Data.Foldable qualified as F
|
||||
import Data.IntMap.Strict qualified as IM
|
||||
import qualified Data.Proxy as D
|
||||
import qualified Basement.From as F
|
||||
import qualified Witch as F
|
||||
import Data.Map.Strict qualified as M
|
||||
import Basement.Bits as B
|
||||
import Data.Bits as B
|
||||
import Data.Maybe (fromMaybe)
|
||||
import Data.Maybe qualified as My
|
||||
import Data.PQueue qualified as PQ
|
||||
|
|
@ -31,7 +33,7 @@ import Data.HuffmanTree
|
|||
|
||||
decompress
|
||||
:: forall a
|
||||
. (Ord a, Integral a, B.FiniteBitsOps a)
|
||||
. (Ord a, Integral a, B.FiniteBits a)
|
||||
=> (TreeDirs, HuffmanTree a)
|
||||
-> Maybe BS.ByteString
|
||||
decompress (TreeDirs treeDirs, tree) = BS.concat . map toByteString <$> decompress' treeDirs
|
||||
|
|
@ -50,7 +52,7 @@ decompress (TreeDirs treeDirs, tree) = BS.concat . map toByteString <$> decompre
|
|||
|
||||
compress
|
||||
:: forall a
|
||||
. (Ord a, Integral a, B.FiniteBitsOps a, B.BitOps a)
|
||||
. (Ord a, Integral a, B.FiniteBits a, B.Bits a)
|
||||
=> BS.ByteString
|
||||
-> Maybe (TreeDirs, HuffmanTree a)
|
||||
compress bs =
|
||||
|
|
@ -112,7 +114,7 @@ divideByteString n bs = x : divideByteString n xs
|
|||
|
||||
compressionRatioFor
|
||||
:: forall a
|
||||
. (Integral a, B.FiniteBitsOps a, B.BitOps a, Ord a, C.Serialize a)
|
||||
. (Integral a, B.FiniteBits a, B.Bits a, Ord a, C.Serialize a)
|
||||
=> D.Proxy a
|
||||
-> BS.ByteString
|
||||
-> Double
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
module Compress.PrefixTree where
|
||||
|
||||
import Basement.Bits qualified as B
|
||||
import qualified Basement.From as F
|
||||
import Data.Bits qualified as B
|
||||
import qualified Witch as F
|
||||
import Compress.Huffman qualified as H
|
||||
import Control.Applicative qualified as A
|
||||
import Data.Bifunctor qualified as Bi
|
||||
|
|
@ -16,10 +16,10 @@ import Data.Ord qualified as O
|
|||
import Data.PQueue qualified as PQ
|
||||
import Debug.Trace qualified as D
|
||||
import Debug.Trace qualified as T
|
||||
import Basement.Bits (FiniteBitsOps(numberOfBits))
|
||||
import GHC.Generics
|
||||
import qualified Data.Serialize as C
|
||||
import Data.FiniteBit
|
||||
import qualified Data.FiniteBit as B
|
||||
|
||||
data Tree a = (Ord a) =>
|
||||
Tree
|
||||
|
|
@ -32,7 +32,7 @@ newtype HuffmanPrefixTree a b = HuffmanPrefixTree
|
|||
|
||||
finiteBitTupleUncons ::
|
||||
forall a b.
|
||||
(Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) =>
|
||||
(Integral a, B.FiniteBits a, B.Bits a, Integral b, B.FiniteBits b, B.Bits b) =>
|
||||
BS.ByteString ->
|
||||
Maybe ((a, b), BS.ByteString)
|
||||
finiteBitTupleUncons bs = case finiteBitUncons bs of
|
||||
|
|
@ -43,7 +43,7 @@ finiteBitTupleUncons bs = case finiteBitUncons bs of
|
|||
|
||||
fromByteString ::
|
||||
forall a b.
|
||||
(Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) =>
|
||||
(Integral a, B.FiniteBits a, B.Bits a, Integral b, B.FiniteBits b, B.Bits b) =>
|
||||
BS.ByteString ->
|
||||
[(a, b)]
|
||||
fromByteString bs = case finiteBitTupleUncons bs of
|
||||
|
|
@ -52,7 +52,7 @@ fromByteString bs = case finiteBitTupleUncons bs of
|
|||
|
||||
toHuffmanTree ::
|
||||
forall a b.
|
||||
(Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) =>
|
||||
(Integral a, B.FiniteBits a, B.Bits a, Integral b, B.FiniteBits b, B.Bits b) =>
|
||||
M.Map (a, b) Word ->
|
||||
HuffmanPrefixTree a b
|
||||
toHuffmanTree =
|
||||
|
|
@ -64,7 +64,7 @@ toHuffmanTree =
|
|||
|
||||
decompress ::
|
||||
forall a .
|
||||
(Integral a, B.FiniteBitsOps a, B.BitOps a) =>
|
||||
(Integral a, B.FiniteBits a, B.Bits a) =>
|
||||
(TreeDirs, HuffmanPrefixTree a a, a)
|
||||
-> Maybe BS.ByteString
|
||||
decompress (TreeDirs treeDirs'', HuffmanPrefixTree prefixTree, initial') = BS.concat . map toByteString . (initial' :) <$> decompress' treeDirs'' initial'
|
||||
|
|
@ -77,7 +77,7 @@ decompress (TreeDirs treeDirs'', HuffmanPrefixTree prefixTree, initial') = BS.co
|
|||
|
||||
compress ::
|
||||
forall a b.
|
||||
(Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) =>
|
||||
(Integral a, B.FiniteBits a, B.Bits a, Integral b, B.FiniteBits b, B.Bits b) =>
|
||||
BS.ByteString -> Maybe (TreeDirs, HuffmanPrefixTree a b, a)
|
||||
compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> initial
|
||||
where
|
||||
|
|
@ -129,7 +129,7 @@ compress bs = (TreeDirs $ concatMap treeDirsFor asFiniteBitPairs, tree, ) <$> in
|
|||
|
||||
nGramCounts ::
|
||||
forall a b.
|
||||
(Integral a, B.FiniteBitsOps a, B.BitOps a, Integral b, B.FiniteBitsOps b, B.BitOps b) =>
|
||||
(Integral a, B.FiniteBits a, B.Bits a, Integral b, B.FiniteBits b, B.Bits b) =>
|
||||
BS.ByteString ->
|
||||
M.Map (a, b) Word
|
||||
nGramCounts =
|
||||
|
|
@ -140,20 +140,20 @@ nGramCounts =
|
|||
. map (BS.take len)
|
||||
. BS.tails
|
||||
where
|
||||
len = (`div` 8) . F.from $ numberOfBits (0 :: a) + numberOfBits (0 :: b)
|
||||
len = (`div` 8) . F.from $ B.finiteBitSize (0 :: a) + B.finiteBitSize (0 :: b)
|
||||
|
||||
empty :: (Ord a) => Tree a
|
||||
empty = Tree M.empty
|
||||
|
||||
singleton :: (Ord a) => a -> Tree a
|
||||
singleton x = Tree $ M.singleton x empty
|
||||
singleton x = Tree $ M.singleton x Compress.PrefixTree.empty
|
||||
|
||||
fromSingleList :: (Ord a) => [a] -> Tree a
|
||||
fromSingleList [] = empty
|
||||
fromSingleList [] = Compress.PrefixTree.empty
|
||||
fromSingleList (x : xs) = Tree . M.singleton x . fromSingleList $ xs
|
||||
|
||||
fromList :: (Ord a) => [[a]] -> Tree a
|
||||
fromList = F.foldl' merge empty . map fromSingleList
|
||||
fromList = F.foldl' merge Compress.PrefixTree.empty . map fromSingleList
|
||||
|
||||
-- insert :: Ord a => Tree a -> [a] -> Tree a
|
||||
-- insert (Tree {..}) (x:xs) =
|
||||
|
|
|
|||
|
|
@ -1,40 +1,41 @@
|
|||
module Data.FiniteBit where
|
||||
|
||||
import Data.Bit (cloneToByteString)
|
||||
import Data.Bit qualified as B
|
||||
import Data.Bit qualified as BV
|
||||
import Data.Bits qualified as B
|
||||
import Data.Bits qualified as BV
|
||||
import Data.ByteString qualified as BS
|
||||
import qualified Data.Proxy as D
|
||||
import qualified Basement.From as F
|
||||
import qualified Witch as F
|
||||
import Data.Foldable qualified as F
|
||||
import Data.Word
|
||||
import Basement.Bits as B
|
||||
import Data.Proxy (Proxy)
|
||||
|
||||
class SizedBits a where
|
||||
size :: Proxy a -> Int
|
||||
|
||||
numBytesIn :: forall a. (B.FiniteBitsOps a, Integral a) => D.Proxy a -> Int
|
||||
numBytesIn _ = (`div` 8) . F.from . B.numberOfBits $ (0 :: a)
|
||||
numBytesIn :: forall a. (B.FiniteBits a, Integral a) => D.Proxy a -> Int
|
||||
numBytesIn _ = (`div` 8) . F.from . B.finiteBitSize $ (0 :: a)
|
||||
|
||||
|
||||
toWordsList :: forall a. (Integral a, B.FiniteBitsOps a, B.BitOps a) => BS.ByteString -> [a]
|
||||
toWordsList :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> [a]
|
||||
toWordsList bs = case finiteBitUncons bs of
|
||||
Nothing -> []
|
||||
(Just (x, xs)) -> x : toWordsList xs
|
||||
|
||||
toByteString :: forall a. (Integral a, B.FiniteBitsOps a) => a -> BS.ByteString
|
||||
toByteString :: forall a. (Integral a, B.FiniteBits a) => a -> BS.ByteString
|
||||
toByteString n = BS.pack . take numBytes . map (fromIntegral . (n `B.rotateL`)) $ [8, 16 ..]
|
||||
where
|
||||
numBytes = numBytesIn (D.Proxy :: D.Proxy a)
|
||||
|
||||
finiteBitUncons
|
||||
:: forall a
|
||||
. (Integral a, B.FiniteBitsOps a, B.BitOps a)
|
||||
. (Integral a, B.FiniteBits a)
|
||||
=> BS.ByteString
|
||||
-> Maybe (a, BS.ByteString)
|
||||
finiteBitUncons [] = Nothing
|
||||
finiteBitUncons bs =
|
||||
Just
|
||||
. (,rest)
|
||||
. F.foldl' (.|.) 0
|
||||
. F.foldl' (B..|.) 0
|
||||
. zipWith (flip B.rotateR) [8, 16 ..]
|
||||
. map (fromIntegral :: Word8 -> a)
|
||||
. BS.unpack
|
||||
|
|
|
|||
80
src/Data/Word4.hs
Normal file
80
src/Data/Word4.hs
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
module Data.Word4 where
|
||||
|
||||
import Data.Bits
|
||||
import Data.Word
|
||||
import qualified Data.Serialize as C
|
||||
import qualified Data.Map as M
|
||||
|
||||
-- only use lower 4 bits
|
||||
newtype Word4 = Word4 {
|
||||
inner :: Word8
|
||||
} deriving (Eq, Ord, Show, Enum, Generic, C.Serialize)
|
||||
|
||||
map2 :: (Word8 -> Word8 -> Word8) -> Word4 -> Word4 -> Word4
|
||||
map2 f (Word4 a) (Word4 b) = Word4 $ f a b
|
||||
|
||||
data CompressWord4List = CompressWord4List
|
||||
{ xs :: [Word8],
|
||||
last :: Maybe Word8 -- in lower 4 bits
|
||||
} deriving (Eq, Ord, Generic, Show, C.Serialize, NFData)
|
||||
|
||||
bitwiseAnd a b = getAnd $ And a <> And b
|
||||
|
||||
bitwiseOr a b = getIor $ Ior a <> Ior b
|
||||
|
||||
toCompressedWord4List [] = CompressWord4List [] Nothing
|
||||
toCompressedWord4List [Word4 x] = CompressWord4List [] (Just x)
|
||||
toCompressedWord4List ((Word4 x) : (Word4 x') : xs) =
|
||||
CompressWord4List
|
||||
{ xs = headByte : xs',
|
||||
last = last'
|
||||
}
|
||||
where
|
||||
headByte = bitwiseOr (bitwiseAnd 0xf0 (x .<<. 4)) (bitwiseAnd 0x0f x')
|
||||
|
||||
(CompressWord4List xs' last') = toCompressedWord4List xs
|
||||
|
||||
fromCompressedWord4List :: CompressWord4List -> [Word4]
|
||||
fromCompressedWord4List (CompressWord4List [] Nothing) = []
|
||||
fromCompressedWord4List (CompressWord4List [] (Just a)) = [word4 a]
|
||||
fromCompressedWord4List (CompressWord4List {xs = (x : xs), ..}) =
|
||||
Word4 ((bitwiseAnd 0xf0 x) .>>. 4)
|
||||
: word4 x
|
||||
: fromCompressedWord4List (CompressWord4List xs last)
|
||||
|
||||
-- instance Show Word4 where
|
||||
-- show :: Word4 -> String
|
||||
-- show (Word4 a) = show . (.>>. 4) $ a
|
||||
|
||||
instance Bounded Word4 where
|
||||
minBound = word4 0
|
||||
maxBound = word4 0xf
|
||||
|
||||
instance Real Word4 where
|
||||
toRational = toRational . inner
|
||||
|
||||
instance Integral Word4 where
|
||||
quot = map2 quot `on` clean
|
||||
rem = map2 rem `on` clean
|
||||
div = map2 div `on` clean
|
||||
mod = map2 mod `on` clean
|
||||
quotRem :: Word4 -> Word4 -> (Word4, Word4)
|
||||
quotRem (Word4 a) (Word4 b) = (word4 a', word4 b')
|
||||
where
|
||||
(a', b') = quotRem a b
|
||||
|
||||
divMod (Word4 a) (Word4 b) = (word4 a', word4 b')
|
||||
where
|
||||
(a', b') = divMod a b
|
||||
toInteger = toInteger . inner . clean
|
||||
|
||||
clean (Word4 a) = Word4 . bitwiseAnd 0xf $ a
|
||||
word4 = clean . Word4
|
||||
|
||||
instance Num Word4 where
|
||||
(+) = map2 (+) `on` clean
|
||||
(*) = map2 (*) `on` clean
|
||||
abs = id
|
||||
negate = id
|
||||
signum = const 1
|
||||
fromInteger = clean . Word4 . fromInteger
|
||||
|
|
@ -1,6 +1,5 @@
|
|||
module Main where
|
||||
|
||||
import Basement.Bits qualified as B
|
||||
import Compress.Huffman
|
||||
import Compress.Huffman qualified as HT
|
||||
import Compress.PrefixTree (HuffmanPrefixTree (HuffmanPrefixTree))
|
||||
|
|
@ -31,7 +30,7 @@ data CLIOpts = CLIOpts
|
|||
|
||||
applyCompressionOptions ::
|
||||
forall a.
|
||||
(Integral a, B.BitOps a, B.FiniteBitsOps a, Ord a, C.Serialize a) =>
|
||||
(Integral a, B.Bits a, B.FiniteBits a, Ord a, C.Serialize a) =>
|
||||
P.Proxy a ->
|
||||
CLIOpts ->
|
||||
BS.ByteString ->
|
||||
|
|
@ -53,7 +52,7 @@ main = do
|
|||
-- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy"
|
||||
f <- BS.getContents
|
||||
let fAsWords = (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f
|
||||
let compressedUnencoded :: A.Compressed Word8 = (A.compress fAsWords)
|
||||
let compressedUnencoded :: (A.Compressed Word8 (A.Word4MapSerialized Word8)) = (A.compress fAsWords)
|
||||
let compressed = C.encode compressedUnencoded
|
||||
print ("compression ratio", (fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f))
|
||||
print ("works?", (A.decompress compressedUnencoded) == fAsWords)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue