compression works
Word16 seems to give the best compression ratio. It takes a very long time to run on >5MB files. Decompression is not implemented yet, but I'm confident it works.
This commit is contained in:
commit
874d0b161d
7 changed files with 420 additions and 0 deletions
1
.envrc
Normal file
1
.envrc
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
use flake;
|
||||||
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
/dist-newstyle/
|
||||||
|
/result
|
||||||
|
/.direnv/
|
||||||
|
**/.DS_Store
|
||||||
53
compress.cabal
Normal file
53
compress.cabal
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
cabal-version: 3.0
|
||||||
|
name: compress
|
||||||
|
version: 0.1.0.0
|
||||||
|
category: Web
|
||||||
|
build-type: Simple
|
||||||
|
|
||||||
|
executable compress
|
||||||
|
main-is:
|
||||||
|
Main.hs
|
||||||
|
default-extensions:
|
||||||
|
DataKinds,
|
||||||
|
DeriveAnyClass,
|
||||||
|
DeriveGeneric,
|
||||||
|
DuplicateRecordFields,
|
||||||
|
ExtendedDefaultRules,
|
||||||
|
FlexibleContexts,
|
||||||
|
FlexibleInstances,
|
||||||
|
ImpredicativeTypes,
|
||||||
|
InstanceSigs,
|
||||||
|
MultiParamTypeClasses,
|
||||||
|
NamedFieldPuns,
|
||||||
|
OverloadedLabels,
|
||||||
|
OverloadedLists,
|
||||||
|
OverloadedStrings,
|
||||||
|
PartialTypeSignatures,
|
||||||
|
RankNTypes,
|
||||||
|
RecordWildCards,
|
||||||
|
RecursiveDo,
|
||||||
|
ScopedTypeVariables,
|
||||||
|
StandaloneDeriving,
|
||||||
|
StrictData,
|
||||||
|
TemplateHaskell,
|
||||||
|
TupleSections,
|
||||||
|
TypeApplications,
|
||||||
|
TypeFamilies,
|
||||||
|
TypeOperators,
|
||||||
|
TypeSynonymInstances,
|
||||||
|
UndecidableInstances,
|
||||||
|
hs-source-dirs:
|
||||||
|
src
|
||||||
|
build-depends:
|
||||||
|
base,
|
||||||
|
bitvec,
|
||||||
|
bytestring,
|
||||||
|
cereal,
|
||||||
|
containers,
|
||||||
|
text,
|
||||||
|
uuid,
|
||||||
|
vector
|
||||||
|
default-language:
|
||||||
|
GHC2021
|
||||||
|
other-modules:
|
||||||
|
Data.SortedList
|
||||||
117
flake.lock
generated
Normal file
117
flake.lock
generated
Normal file
|
|
@ -0,0 +1,117 @@
|
||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"flake-parts": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs-lib": "nixpkgs-lib"
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1709336216,
|
||||||
|
"narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=",
|
||||||
|
"owner": "hercules-ci",
|
||||||
|
"repo": "flake-parts",
|
||||||
|
"rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "hercules-ci",
|
||||||
|
"repo": "flake-parts",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"haskell-flake": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1711149116,
|
||||||
|
"narHash": "sha256-tccTtjRvxrhSJkCnmNwaPrq0DDM3UsM0uiDyW4uJXXc=",
|
||||||
|
"owner": "srid",
|
||||||
|
"repo": "haskell-flake",
|
||||||
|
"rev": "6ae8a85071adfe08d70d9963c526947403c6c070",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "srid",
|
||||||
|
"repo": "haskell-flake",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1711163522,
|
||||||
|
"narHash": "sha256-YN/Ciidm+A0fmJPWlHBGvVkcarYWSC+s3NTPk/P+q3c=",
|
||||||
|
"owner": "nixos",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "44d0940ea560dee511026a53f0e2e2cde489b4d4",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nixos",
|
||||||
|
"ref": "nixos-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs-lib": {
|
||||||
|
"locked": {
|
||||||
|
"dir": "lib",
|
||||||
|
"lastModified": 1709237383,
|
||||||
|
"narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"dir": "lib",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"ref": "nixos-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"flake-parts": "flake-parts",
|
||||||
|
"haskell-flake": "haskell-flake",
|
||||||
|
"nixpkgs": "nixpkgs",
|
||||||
|
"systems": "systems",
|
||||||
|
"treefmt-nix": "treefmt-nix"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"systems": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1681028828,
|
||||||
|
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nix-systems",
|
||||||
|
"repo": "default",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"treefmt-nix": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1710781103,
|
||||||
|
"narHash": "sha256-nehQK/XTFxfa6rYKtbi8M1w+IU1v5twYhiyA4dg1vpg=",
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "treefmt-nix",
|
||||||
|
"rev": "7ee5aaac63c30d3c97a8c56efe89f3b2aa9ae564",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "numtide",
|
||||||
|
"repo": "treefmt-nix",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
||||||
84
flake.nix
Normal file
84
flake.nix
Normal file
|
|
@ -0,0 +1,84 @@
|
||||||
|
{
|
||||||
|
description = "srid/haskell-template: Nix template for Haskell projects";
|
||||||
|
inputs = {
|
||||||
|
nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
|
||||||
|
systems.url = "github:nix-systems/default";
|
||||||
|
flake-parts.url = "github:hercules-ci/flake-parts";
|
||||||
|
haskell-flake.url = "github:srid/haskell-flake";
|
||||||
|
treefmt-nix.url = "github:numtide/treefmt-nix";
|
||||||
|
treefmt-nix.inputs.nixpkgs.follows = "nixpkgs";
|
||||||
|
};
|
||||||
|
|
||||||
|
outputs = inputs:
|
||||||
|
inputs.flake-parts.lib.mkFlake { inherit inputs; } {
|
||||||
|
systems = import inputs.systems;
|
||||||
|
imports = [
|
||||||
|
inputs.haskell-flake.flakeModule
|
||||||
|
inputs.treefmt-nix.flakeModule
|
||||||
|
];
|
||||||
|
perSystem = { self', system, lib, config, pkgs, ... }: {
|
||||||
|
# Our only Haskell project. You can have multiple projects, but this template
|
||||||
|
# has only one.
|
||||||
|
# See https://github.com/srid/haskell-flake/blob/master/example/flake.nix
|
||||||
|
haskellProjects.default = {
|
||||||
|
# The base package set (this value is the default)
|
||||||
|
# basePackages = pkgs.haskellPackages;
|
||||||
|
|
||||||
|
# Packages to add on top of `basePackages`
|
||||||
|
packages = {
|
||||||
|
# Add source or Hackage overrides here
|
||||||
|
# (Local packages are added automatically)
|
||||||
|
# https://github.com/lehins/hip.git
|
||||||
|
# hip.source = inputs.hip + "/hip";
|
||||||
|
};
|
||||||
|
|
||||||
|
# Add your package overrides here
|
||||||
|
settings = {
|
||||||
|
# barbies-th = {
|
||||||
|
# broken = false;
|
||||||
|
# jailbreak = true;
|
||||||
|
# };
|
||||||
|
};
|
||||||
|
|
||||||
|
# Development shell configuration
|
||||||
|
devShell = {
|
||||||
|
hlsCheck.enable = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
# What should haskell-flake add to flake outputs?
|
||||||
|
autoWire = [ "packages" "apps" "checks" ]; # Wire all but the devShell
|
||||||
|
};
|
||||||
|
|
||||||
|
# Auto formatters. This also adds a flake check to ensure that the
|
||||||
|
# source tree was auto formatted.
|
||||||
|
treefmt.config = {
|
||||||
|
projectRootFile = "flake.nix";
|
||||||
|
|
||||||
|
programs.ormolu.enable = true;
|
||||||
|
programs.nixpkgs-fmt.enable = true;
|
||||||
|
programs.cabal-fmt.enable = true;
|
||||||
|
programs.hlint.enable = true;
|
||||||
|
|
||||||
|
# We use fourmolu
|
||||||
|
programs.ormolu.package = pkgs.haskellPackages.fourmolu;
|
||||||
|
};
|
||||||
|
|
||||||
|
# Default package & app.
|
||||||
|
packages.default = self'.packages.compress;
|
||||||
|
apps.default = self'.apps.compress;
|
||||||
|
|
||||||
|
# Default shell.
|
||||||
|
devShells.default = pkgs.mkShell {
|
||||||
|
name = "haskell-template";
|
||||||
|
meta.description = "Haskell development environment";
|
||||||
|
# See https://zero-to-flakes.com/haskell-flake/devshell#composing-devshells
|
||||||
|
inputsFrom = [
|
||||||
|
config.haskellProjects.default.outputs.devShell
|
||||||
|
config.treefmt.build.devShell
|
||||||
|
];
|
||||||
|
nativeBuildInputs = with pkgs; [
|
||||||
|
];
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
27
src/Data/SortedList.hs
Normal file
27
src/Data/SortedList.hs
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
module Data.SortedList where
|
||||||
|
import qualified Data.List as L
|
||||||
|
|
||||||
|
data SortedList a = Ord a => SortedList {
|
||||||
|
toList :: [a]
|
||||||
|
}
|
||||||
|
|
||||||
|
deriving instance Show a => Show (SortedList a)
|
||||||
|
|
||||||
|
map :: Ord a1 => ([a2] -> [a1]) -> SortedList a2 -> SortedList a1
|
||||||
|
map f (SortedList a) = SortedList . f $ a
|
||||||
|
|
||||||
|
fromList :: Ord a => [a] -> SortedList a
|
||||||
|
fromList = SortedList . L.sort
|
||||||
|
|
||||||
|
singleton :: Ord a => a -> SortedList a
|
||||||
|
singleton a = SortedList [a]
|
||||||
|
|
||||||
|
minView :: SortedList a -> Maybe (a, SortedList a)
|
||||||
|
minView (SortedList (x:xs)) = Just (x, SortedList xs)
|
||||||
|
minView (SortedList _) = Nothing
|
||||||
|
|
||||||
|
insert :: p -> SortedList p -> SortedList p
|
||||||
|
insert x (SortedList []) = singleton x
|
||||||
|
insert x (SortedList (x' : xs))
|
||||||
|
| x <= x' = SortedList (x : x' : xs)
|
||||||
|
| otherwise = Data.SortedList.map (x' :) . insert x . SortedList $ xs
|
||||||
134
src/Main.hs
Normal file
134
src/Main.hs
Normal file
|
|
@ -0,0 +1,134 @@
|
||||||
|
module Main where
|
||||||
|
|
||||||
|
import Data.Bifunctor qualified as Bi
|
||||||
|
import Data.Bit qualified as B
|
||||||
|
import Data.Bit qualified as BV
|
||||||
|
import Data.Bits (Bits ((.|.)))
|
||||||
|
import Data.Bits qualified as B
|
||||||
|
import Data.ByteString (fromFilePath)
|
||||||
|
import Data.ByteString qualified as BS
|
||||||
|
import Data.Foldable qualified as F
|
||||||
|
import Data.IntMap.Strict qualified as IM
|
||||||
|
import Data.Map.Strict qualified as M
|
||||||
|
import Data.Maybe (fromMaybe)
|
||||||
|
import Data.Maybe qualified as My
|
||||||
|
import Data.Serialize qualified as C
|
||||||
|
import Data.SortedList qualified as SL
|
||||||
|
import Data.Text qualified as T
|
||||||
|
import Data.Text.IO qualified as TIO
|
||||||
|
import Data.Vector.Unboxed qualified as V
|
||||||
|
import Data.Word
|
||||||
|
import GHC.Generics (Generic)
|
||||||
|
import System.Environment qualified as SE
|
||||||
|
|
||||||
|
data HuffmanTree a
|
||||||
|
= Leaf a
|
||||||
|
| Node
|
||||||
|
{ left :: HuffmanTree a
|
||||||
|
, right :: HuffmanTree a
|
||||||
|
}
|
||||||
|
deriving (Eq, Ord, Show, Generic, C.Serialize)
|
||||||
|
|
||||||
|
data SizedHuffmanTree a = SizedHuffmanTree
|
||||||
|
{ size :: Int
|
||||||
|
, tree :: HuffmanTree a
|
||||||
|
}
|
||||||
|
deriving (Eq, Ord, Show)
|
||||||
|
|
||||||
|
data TreeDir = L | R deriving (Eq, Ord, Show, Generic, C.Serialize)
|
||||||
|
|
||||||
|
findTreeDirections :: forall a. (Ord a) => HuffmanTree a -> M.Map a [TreeDir]
|
||||||
|
findTreeDirections (Leaf a) = M.singleton a []
|
||||||
|
findTreeDirections (Node{..}) = M.union (rec' L left) (rec' R right)
|
||||||
|
where
|
||||||
|
rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir]
|
||||||
|
rec' dir = M.map (dir :) . findTreeDirections
|
||||||
|
|
||||||
|
mergeSizedTrees tree0 tree1 =
|
||||||
|
SizedHuffmanTree
|
||||||
|
{ size = (size tree0) + (size tree1)
|
||||||
|
, tree = Node (tree tree0) (tree tree1)
|
||||||
|
}
|
||||||
|
|
||||||
|
-- compress :: BS.ByteString -> Maybe (SizedHuffmanTree BS.ByteString)
|
||||||
|
compress :: forall a. (Ord a, Integral a, B.FiniteBits a) => BS.ByteString -> Maybe ([TreeDir], HuffmanTree a)
|
||||||
|
compress bs =
|
||||||
|
liftA2 (,) treeDirections unsizedTree
|
||||||
|
where
|
||||||
|
unsizedTree :: Maybe (HuffmanTree a)
|
||||||
|
unsizedTree = tree <$> mergedHuffmanTrees
|
||||||
|
|
||||||
|
treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString
|
||||||
|
|
||||||
|
mergedHuffmanTrees =
|
||||||
|
mergeHuffmanTrees
|
||||||
|
. SL.fromList
|
||||||
|
. map (uncurry (flip SizedHuffmanTree) . Bi.first Leaf)
|
||||||
|
. counts
|
||||||
|
$ dividedByteString
|
||||||
|
|
||||||
|
treeDirMap :: M.Map a [TreeDir]
|
||||||
|
treeDirMap = My.maybe M.empty (findTreeDirections . tree) mergedHuffmanTrees
|
||||||
|
|
||||||
|
dividedByteString = toBitsList bs
|
||||||
|
|
||||||
|
encodeCompressed :: C.Serialize a => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString
|
||||||
|
encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs)
|
||||||
|
where
|
||||||
|
encodeTreeDirs = BV.cloneToByteString . V.fromList . map (BV.Bit . (== R))
|
||||||
|
|
||||||
|
mergeHuffmanTrees :: SL.SortedList (SizedHuffmanTree a) -> Maybe (SizedHuffmanTree a)
|
||||||
|
mergeHuffmanTrees (SL.SortedList (tree0 : tree1 : xs)) =
|
||||||
|
mergeHuffmanTrees
|
||||||
|
. SL.insert (mergeSizedTrees tree0 tree1)
|
||||||
|
. SL.SortedList
|
||||||
|
$ xs
|
||||||
|
|
||||||
|
mergeHuffmanTrees (SL.SortedList [x]) = Just x
|
||||||
|
mergeHuffmanTrees (SL.SortedList []) = Nothing
|
||||||
|
|
||||||
|
counts :: (Ord a) => [a] -> [(a, Int)]
|
||||||
|
counts = M.toList . F.foldl' combiningInsert M.empty
|
||||||
|
where
|
||||||
|
combiningInsert m key = M.insertWith (+) key 1 m
|
||||||
|
|
||||||
|
divideByteString :: Int -> BS.ByteString -> [BS.ByteString]
|
||||||
|
divideByteString n [] = []
|
||||||
|
divideByteString n bs = x : divideByteString n xs
|
||||||
|
where
|
||||||
|
(x, xs) = BS.splitAt n bs
|
||||||
|
|
||||||
|
toBitsList :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> [a]
|
||||||
|
toBitsList bs = case finiteBitUncons bs of
|
||||||
|
Nothing -> []
|
||||||
|
(Just (x, xs)) -> x : (toBitsList xs)
|
||||||
|
|
||||||
|
finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString)
|
||||||
|
finiteBitUncons [] = Nothing
|
||||||
|
finiteBitUncons bs =
|
||||||
|
Just
|
||||||
|
. (,rest)
|
||||||
|
. F.foldl' (.|.) 0
|
||||||
|
. zipWith (flip B.shiftL) [0, 8 ..]
|
||||||
|
. reverse
|
||||||
|
. map (fromIntegral :: Word8 -> a)
|
||||||
|
. BS.unpack
|
||||||
|
$ takenBytes
|
||||||
|
where
|
||||||
|
numBytes = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a)
|
||||||
|
|
||||||
|
takenBytes :: BS.ByteString
|
||||||
|
rest :: BS.ByteString
|
||||||
|
(takenBytes, rest) = BS.splitAt numBytes bs
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
[filePath] <- SE.getArgs
|
||||||
|
f <- BS.readFile filePath
|
||||||
|
TIO.putStrLn "original:"
|
||||||
|
print . BS.length $ f
|
||||||
|
TIO.putStrLn "rest:"
|
||||||
|
print . ("Word64",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word64)) $ f
|
||||||
|
print . ("Word32",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word32)) $ f
|
||||||
|
print . ("Word16",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word16)) $ f
|
||||||
|
print . ("Word8",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word8)) $ f
|
||||||
Loading…
Add table
Add a link
Reference in a new issue