compression works
Word16 seems to give the best compression ratio. It takes a very long time to run on >5MB files. Decompression is not implemented yet, but I'm confident it works.
This commit is contained in:
commit
874d0b161d
7 changed files with 420 additions and 0 deletions
1
.envrc
Normal file
1
.envrc
Normal file
|
|
@ -0,0 +1 @@
|
|||
use flake;
|
||||
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
/dist-newstyle/
|
||||
/result
|
||||
/.direnv/
|
||||
**/.DS_Store
|
||||
53
compress.cabal
Normal file
53
compress.cabal
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
cabal-version: 3.0
|
||||
name: compress
|
||||
version: 0.1.0.0
|
||||
category: Web
|
||||
build-type: Simple
|
||||
|
||||
executable compress
|
||||
main-is:
|
||||
Main.hs
|
||||
default-extensions:
|
||||
DataKinds,
|
||||
DeriveAnyClass,
|
||||
DeriveGeneric,
|
||||
DuplicateRecordFields,
|
||||
ExtendedDefaultRules,
|
||||
FlexibleContexts,
|
||||
FlexibleInstances,
|
||||
ImpredicativeTypes,
|
||||
InstanceSigs,
|
||||
MultiParamTypeClasses,
|
||||
NamedFieldPuns,
|
||||
OverloadedLabels,
|
||||
OverloadedLists,
|
||||
OverloadedStrings,
|
||||
PartialTypeSignatures,
|
||||
RankNTypes,
|
||||
RecordWildCards,
|
||||
RecursiveDo,
|
||||
ScopedTypeVariables,
|
||||
StandaloneDeriving,
|
||||
StrictData,
|
||||
TemplateHaskell,
|
||||
TupleSections,
|
||||
TypeApplications,
|
||||
TypeFamilies,
|
||||
TypeOperators,
|
||||
TypeSynonymInstances,
|
||||
UndecidableInstances,
|
||||
hs-source-dirs:
|
||||
src
|
||||
build-depends:
|
||||
base,
|
||||
bitvec,
|
||||
bytestring,
|
||||
cereal,
|
||||
containers,
|
||||
text,
|
||||
uuid,
|
||||
vector
|
||||
default-language:
|
||||
GHC2021
|
||||
other-modules:
|
||||
Data.SortedList
|
||||
117
flake.lock
generated
Normal file
117
flake.lock
generated
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
{
|
||||
"nodes": {
|
||||
"flake-parts": {
|
||||
"inputs": {
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1709336216,
|
||||
"narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"haskell-flake": {
|
||||
"locked": {
|
||||
"lastModified": 1711149116,
|
||||
"narHash": "sha256-tccTtjRvxrhSJkCnmNwaPrq0DDM3UsM0uiDyW4uJXXc=",
|
||||
"owner": "srid",
|
||||
"repo": "haskell-flake",
|
||||
"rev": "6ae8a85071adfe08d70d9963c526947403c6c070",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "srid",
|
||||
"repo": "haskell-flake",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1711163522,
|
||||
"narHash": "sha256-YN/Ciidm+A0fmJPWlHBGvVkcarYWSC+s3NTPk/P+q3c=",
|
||||
"owner": "nixos",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "44d0940ea560dee511026a53f0e2e2cde489b4d4",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nixos",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"dir": "lib",
|
||||
"lastModified": 1709237383,
|
||||
"narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"dir": "lib",
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-parts": "flake-parts",
|
||||
"haskell-flake": "haskell-flake",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"systems": "systems",
|
||||
"treefmt-nix": "treefmt-nix"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"treefmt-nix": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1710781103,
|
||||
"narHash": "sha256-nehQK/XTFxfa6rYKtbi8M1w+IU1v5twYhiyA4dg1vpg=",
|
||||
"owner": "numtide",
|
||||
"repo": "treefmt-nix",
|
||||
"rev": "7ee5aaac63c30d3c97a8c56efe89f3b2aa9ae564",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "treefmt-nix",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
84
flake.nix
Normal file
84
flake.nix
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
{
|
||||
description = "srid/haskell-template: Nix template for Haskell projects";
|
||||
inputs = {
|
||||
nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
|
||||
systems.url = "github:nix-systems/default";
|
||||
flake-parts.url = "github:hercules-ci/flake-parts";
|
||||
haskell-flake.url = "github:srid/haskell-flake";
|
||||
treefmt-nix.url = "github:numtide/treefmt-nix";
|
||||
treefmt-nix.inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
outputs = inputs:
|
||||
inputs.flake-parts.lib.mkFlake { inherit inputs; } {
|
||||
systems = import inputs.systems;
|
||||
imports = [
|
||||
inputs.haskell-flake.flakeModule
|
||||
inputs.treefmt-nix.flakeModule
|
||||
];
|
||||
perSystem = { self', system, lib, config, pkgs, ... }: {
|
||||
# Our only Haskell project. You can have multiple projects, but this template
|
||||
# has only one.
|
||||
# See https://github.com/srid/haskell-flake/blob/master/example/flake.nix
|
||||
haskellProjects.default = {
|
||||
# The base package set (this value is the default)
|
||||
# basePackages = pkgs.haskellPackages;
|
||||
|
||||
# Packages to add on top of `basePackages`
|
||||
packages = {
|
||||
# Add source or Hackage overrides here
|
||||
# (Local packages are added automatically)
|
||||
# https://github.com/lehins/hip.git
|
||||
# hip.source = inputs.hip + "/hip";
|
||||
};
|
||||
|
||||
# Add your package overrides here
|
||||
settings = {
|
||||
# barbies-th = {
|
||||
# broken = false;
|
||||
# jailbreak = true;
|
||||
# };
|
||||
};
|
||||
|
||||
# Development shell configuration
|
||||
devShell = {
|
||||
hlsCheck.enable = false;
|
||||
};
|
||||
|
||||
# What should haskell-flake add to flake outputs?
|
||||
autoWire = [ "packages" "apps" "checks" ]; # Wire all but the devShell
|
||||
};
|
||||
|
||||
# Auto formatters. This also adds a flake check to ensure that the
|
||||
# source tree was auto formatted.
|
||||
treefmt.config = {
|
||||
projectRootFile = "flake.nix";
|
||||
|
||||
programs.ormolu.enable = true;
|
||||
programs.nixpkgs-fmt.enable = true;
|
||||
programs.cabal-fmt.enable = true;
|
||||
programs.hlint.enable = true;
|
||||
|
||||
# We use fourmolu
|
||||
programs.ormolu.package = pkgs.haskellPackages.fourmolu;
|
||||
};
|
||||
|
||||
# Default package & app.
|
||||
packages.default = self'.packages.compress;
|
||||
apps.default = self'.apps.compress;
|
||||
|
||||
# Default shell.
|
||||
devShells.default = pkgs.mkShell {
|
||||
name = "haskell-template";
|
||||
meta.description = "Haskell development environment";
|
||||
# See https://zero-to-flakes.com/haskell-flake/devshell#composing-devshells
|
||||
inputsFrom = [
|
||||
config.haskellProjects.default.outputs.devShell
|
||||
config.treefmt.build.devShell
|
||||
];
|
||||
nativeBuildInputs = with pkgs; [
|
||||
];
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
27
src/Data/SortedList.hs
Normal file
27
src/Data/SortedList.hs
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
module Data.SortedList where
|
||||
import qualified Data.List as L
|
||||
|
||||
data SortedList a = Ord a => SortedList {
|
||||
toList :: [a]
|
||||
}
|
||||
|
||||
deriving instance Show a => Show (SortedList a)
|
||||
|
||||
map :: Ord a1 => ([a2] -> [a1]) -> SortedList a2 -> SortedList a1
|
||||
map f (SortedList a) = SortedList . f $ a
|
||||
|
||||
fromList :: Ord a => [a] -> SortedList a
|
||||
fromList = SortedList . L.sort
|
||||
|
||||
singleton :: Ord a => a -> SortedList a
|
||||
singleton a = SortedList [a]
|
||||
|
||||
minView :: SortedList a -> Maybe (a, SortedList a)
|
||||
minView (SortedList (x:xs)) = Just (x, SortedList xs)
|
||||
minView (SortedList _) = Nothing
|
||||
|
||||
insert :: p -> SortedList p -> SortedList p
|
||||
insert x (SortedList []) = singleton x
|
||||
insert x (SortedList (x' : xs))
|
||||
| x <= x' = SortedList (x : x' : xs)
|
||||
| otherwise = Data.SortedList.map (x' :) . insert x . SortedList $ xs
|
||||
134
src/Main.hs
Normal file
134
src/Main.hs
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
module Main where
|
||||
|
||||
import Data.Bifunctor qualified as Bi
|
||||
import Data.Bit qualified as B
|
||||
import Data.Bit qualified as BV
|
||||
import Data.Bits (Bits ((.|.)))
|
||||
import Data.Bits qualified as B
|
||||
import Data.ByteString (fromFilePath)
|
||||
import Data.ByteString qualified as BS
|
||||
import Data.Foldable qualified as F
|
||||
import Data.IntMap.Strict qualified as IM
|
||||
import Data.Map.Strict qualified as M
|
||||
import Data.Maybe (fromMaybe)
|
||||
import Data.Maybe qualified as My
|
||||
import Data.Serialize qualified as C
|
||||
import Data.SortedList qualified as SL
|
||||
import Data.Text qualified as T
|
||||
import Data.Text.IO qualified as TIO
|
||||
import Data.Vector.Unboxed qualified as V
|
||||
import Data.Word
|
||||
import GHC.Generics (Generic)
|
||||
import System.Environment qualified as SE
|
||||
|
||||
data HuffmanTree a
|
||||
= Leaf a
|
||||
| Node
|
||||
{ left :: HuffmanTree a
|
||||
, right :: HuffmanTree a
|
||||
}
|
||||
deriving (Eq, Ord, Show, Generic, C.Serialize)
|
||||
|
||||
data SizedHuffmanTree a = SizedHuffmanTree
|
||||
{ size :: Int
|
||||
, tree :: HuffmanTree a
|
||||
}
|
||||
deriving (Eq, Ord, Show)
|
||||
|
||||
data TreeDir = L | R deriving (Eq, Ord, Show, Generic, C.Serialize)
|
||||
|
||||
findTreeDirections :: forall a. (Ord a) => HuffmanTree a -> M.Map a [TreeDir]
|
||||
findTreeDirections (Leaf a) = M.singleton a []
|
||||
findTreeDirections (Node{..}) = M.union (rec' L left) (rec' R right)
|
||||
where
|
||||
rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir]
|
||||
rec' dir = M.map (dir :) . findTreeDirections
|
||||
|
||||
mergeSizedTrees tree0 tree1 =
|
||||
SizedHuffmanTree
|
||||
{ size = (size tree0) + (size tree1)
|
||||
, tree = Node (tree tree0) (tree tree1)
|
||||
}
|
||||
|
||||
-- compress :: BS.ByteString -> Maybe (SizedHuffmanTree BS.ByteString)
|
||||
compress :: forall a. (Ord a, Integral a, B.FiniteBits a) => BS.ByteString -> Maybe ([TreeDir], HuffmanTree a)
|
||||
compress bs =
|
||||
liftA2 (,) treeDirections unsizedTree
|
||||
where
|
||||
unsizedTree :: Maybe (HuffmanTree a)
|
||||
unsizedTree = tree <$> mergedHuffmanTrees
|
||||
|
||||
treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString
|
||||
|
||||
mergedHuffmanTrees =
|
||||
mergeHuffmanTrees
|
||||
. SL.fromList
|
||||
. map (uncurry (flip SizedHuffmanTree) . Bi.first Leaf)
|
||||
. counts
|
||||
$ dividedByteString
|
||||
|
||||
treeDirMap :: M.Map a [TreeDir]
|
||||
treeDirMap = My.maybe M.empty (findTreeDirections . tree) mergedHuffmanTrees
|
||||
|
||||
dividedByteString = toBitsList bs
|
||||
|
||||
encodeCompressed :: C.Serialize a => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString
|
||||
encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs)
|
||||
where
|
||||
encodeTreeDirs = BV.cloneToByteString . V.fromList . map (BV.Bit . (== R))
|
||||
|
||||
mergeHuffmanTrees :: SL.SortedList (SizedHuffmanTree a) -> Maybe (SizedHuffmanTree a)
|
||||
mergeHuffmanTrees (SL.SortedList (tree0 : tree1 : xs)) =
|
||||
mergeHuffmanTrees
|
||||
. SL.insert (mergeSizedTrees tree0 tree1)
|
||||
. SL.SortedList
|
||||
$ xs
|
||||
|
||||
mergeHuffmanTrees (SL.SortedList [x]) = Just x
|
||||
mergeHuffmanTrees (SL.SortedList []) = Nothing
|
||||
|
||||
counts :: (Ord a) => [a] -> [(a, Int)]
|
||||
counts = M.toList . F.foldl' combiningInsert M.empty
|
||||
where
|
||||
combiningInsert m key = M.insertWith (+) key 1 m
|
||||
|
||||
divideByteString :: Int -> BS.ByteString -> [BS.ByteString]
|
||||
divideByteString n [] = []
|
||||
divideByteString n bs = x : divideByteString n xs
|
||||
where
|
||||
(x, xs) = BS.splitAt n bs
|
||||
|
||||
toBitsList :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> [a]
|
||||
toBitsList bs = case finiteBitUncons bs of
|
||||
Nothing -> []
|
||||
(Just (x, xs)) -> x : (toBitsList xs)
|
||||
|
||||
finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString)
|
||||
finiteBitUncons [] = Nothing
|
||||
finiteBitUncons bs =
|
||||
Just
|
||||
. (,rest)
|
||||
. F.foldl' (.|.) 0
|
||||
. zipWith (flip B.shiftL) [0, 8 ..]
|
||||
. reverse
|
||||
. map (fromIntegral :: Word8 -> a)
|
||||
. BS.unpack
|
||||
$ takenBytes
|
||||
where
|
||||
numBytes = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a)
|
||||
|
||||
takenBytes :: BS.ByteString
|
||||
rest :: BS.ByteString
|
||||
(takenBytes, rest) = BS.splitAt numBytes bs
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
[filePath] <- SE.getArgs
|
||||
f <- BS.readFile filePath
|
||||
TIO.putStrLn "original:"
|
||||
print . BS.length $ f
|
||||
TIO.putStrLn "rest:"
|
||||
print . ("Word64",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word64)) $ f
|
||||
print . ("Word32",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word32)) $ f
|
||||
print . ("Word16",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word16)) $ f
|
||||
print . ("Word8",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word8)) $ f
|
||||
Loading…
Add table
Add a link
Reference in a new issue