compression works

Word16 seems to give the best compression ratio. It takes a very long
time to run on >5MB files. Decompression is not implemented yet, but I'm
confident it works.
This commit is contained in:
Jack Wines 2024-04-13 02:13:24 -07:00
commit 874d0b161d
No known key found for this signature in database
GPG key ID: 25B20640600571E6
7 changed files with 420 additions and 0 deletions

1
.envrc Normal file
View file

@ -0,0 +1 @@
use flake;

4
.gitignore vendored Normal file
View file

@ -0,0 +1,4 @@
/dist-newstyle/
/result
/.direnv/
**/.DS_Store

53
compress.cabal Normal file
View file

@ -0,0 +1,53 @@
cabal-version: 3.0
name: compress
version: 0.1.0.0
category: Web
build-type: Simple
executable compress
main-is:
Main.hs
default-extensions:
DataKinds,
DeriveAnyClass,
DeriveGeneric,
DuplicateRecordFields,
ExtendedDefaultRules,
FlexibleContexts,
FlexibleInstances,
ImpredicativeTypes,
InstanceSigs,
MultiParamTypeClasses,
NamedFieldPuns,
OverloadedLabels,
OverloadedLists,
OverloadedStrings,
PartialTypeSignatures,
RankNTypes,
RecordWildCards,
RecursiveDo,
ScopedTypeVariables,
StandaloneDeriving,
StrictData,
TemplateHaskell,
TupleSections,
TypeApplications,
TypeFamilies,
TypeOperators,
TypeSynonymInstances,
UndecidableInstances,
hs-source-dirs:
src
build-depends:
base,
bitvec,
bytestring,
cereal,
containers,
text,
uuid,
vector
default-language:
GHC2021
other-modules:
Data.SortedList

117
flake.lock generated Normal file
View file

@ -0,0 +1,117 @@
{
"nodes": {
"flake-parts": {
"inputs": {
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1709336216,
"narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2",
"type": "github"
},
"original": {
"owner": "hercules-ci",
"repo": "flake-parts",
"type": "github"
}
},
"haskell-flake": {
"locked": {
"lastModified": 1711149116,
"narHash": "sha256-tccTtjRvxrhSJkCnmNwaPrq0DDM3UsM0uiDyW4uJXXc=",
"owner": "srid",
"repo": "haskell-flake",
"rev": "6ae8a85071adfe08d70d9963c526947403c6c070",
"type": "github"
},
"original": {
"owner": "srid",
"repo": "haskell-flake",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1711163522,
"narHash": "sha256-YN/Ciidm+A0fmJPWlHBGvVkcarYWSC+s3NTPk/P+q3c=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "44d0940ea560dee511026a53f0e2e2cde489b4d4",
"type": "github"
},
"original": {
"owner": "nixos",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs-lib": {
"locked": {
"dir": "lib",
"lastModified": 1709237383,
"narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
"type": "github"
},
"original": {
"dir": "lib",
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-parts": "flake-parts",
"haskell-flake": "haskell-flake",
"nixpkgs": "nixpkgs",
"systems": "systems",
"treefmt-nix": "treefmt-nix"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
},
"treefmt-nix": {
"inputs": {
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1710781103,
"narHash": "sha256-nehQK/XTFxfa6rYKtbi8M1w+IU1v5twYhiyA4dg1vpg=",
"owner": "numtide",
"repo": "treefmt-nix",
"rev": "7ee5aaac63c30d3c97a8c56efe89f3b2aa9ae564",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "treefmt-nix",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

84
flake.nix Normal file
View file

@ -0,0 +1,84 @@
{
description = "srid/haskell-template: Nix template for Haskell projects";
inputs = {
nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
systems.url = "github:nix-systems/default";
flake-parts.url = "github:hercules-ci/flake-parts";
haskell-flake.url = "github:srid/haskell-flake";
treefmt-nix.url = "github:numtide/treefmt-nix";
treefmt-nix.inputs.nixpkgs.follows = "nixpkgs";
};
outputs = inputs:
inputs.flake-parts.lib.mkFlake { inherit inputs; } {
systems = import inputs.systems;
imports = [
inputs.haskell-flake.flakeModule
inputs.treefmt-nix.flakeModule
];
perSystem = { self', system, lib, config, pkgs, ... }: {
# Our only Haskell project. You can have multiple projects, but this template
# has only one.
# See https://github.com/srid/haskell-flake/blob/master/example/flake.nix
haskellProjects.default = {
# The base package set (this value is the default)
# basePackages = pkgs.haskellPackages;
# Packages to add on top of `basePackages`
packages = {
# Add source or Hackage overrides here
# (Local packages are added automatically)
# https://github.com/lehins/hip.git
# hip.source = inputs.hip + "/hip";
};
# Add your package overrides here
settings = {
# barbies-th = {
# broken = false;
# jailbreak = true;
# };
};
# Development shell configuration
devShell = {
hlsCheck.enable = false;
};
# What should haskell-flake add to flake outputs?
autoWire = [ "packages" "apps" "checks" ]; # Wire all but the devShell
};
# Auto formatters. This also adds a flake check to ensure that the
# source tree was auto formatted.
treefmt.config = {
projectRootFile = "flake.nix";
programs.ormolu.enable = true;
programs.nixpkgs-fmt.enable = true;
programs.cabal-fmt.enable = true;
programs.hlint.enable = true;
# We use fourmolu
programs.ormolu.package = pkgs.haskellPackages.fourmolu;
};
# Default package & app.
packages.default = self'.packages.compress;
apps.default = self'.apps.compress;
# Default shell.
devShells.default = pkgs.mkShell {
name = "haskell-template";
meta.description = "Haskell development environment";
# See https://zero-to-flakes.com/haskell-flake/devshell#composing-devshells
inputsFrom = [
config.haskellProjects.default.outputs.devShell
config.treefmt.build.devShell
];
nativeBuildInputs = with pkgs; [
];
};
};
};
}

27
src/Data/SortedList.hs Normal file
View file

@ -0,0 +1,27 @@
module Data.SortedList where
import qualified Data.List as L
data SortedList a = Ord a => SortedList {
toList :: [a]
}
deriving instance Show a => Show (SortedList a)
map :: Ord a1 => ([a2] -> [a1]) -> SortedList a2 -> SortedList a1
map f (SortedList a) = SortedList . f $ a
fromList :: Ord a => [a] -> SortedList a
fromList = SortedList . L.sort
singleton :: Ord a => a -> SortedList a
singleton a = SortedList [a]
minView :: SortedList a -> Maybe (a, SortedList a)
minView (SortedList (x:xs)) = Just (x, SortedList xs)
minView (SortedList _) = Nothing
insert :: p -> SortedList p -> SortedList p
insert x (SortedList []) = singleton x
insert x (SortedList (x' : xs))
| x <= x' = SortedList (x : x' : xs)
| otherwise = Data.SortedList.map (x' :) . insert x . SortedList $ xs

134
src/Main.hs Normal file
View file

@ -0,0 +1,134 @@
module Main where
import Data.Bifunctor qualified as Bi
import Data.Bit qualified as B
import Data.Bit qualified as BV
import Data.Bits (Bits ((.|.)))
import Data.Bits qualified as B
import Data.ByteString (fromFilePath)
import Data.ByteString qualified as BS
import Data.Foldable qualified as F
import Data.IntMap.Strict qualified as IM
import Data.Map.Strict qualified as M
import Data.Maybe (fromMaybe)
import Data.Maybe qualified as My
import Data.Serialize qualified as C
import Data.SortedList qualified as SL
import Data.Text qualified as T
import Data.Text.IO qualified as TIO
import Data.Vector.Unboxed qualified as V
import Data.Word
import GHC.Generics (Generic)
import System.Environment qualified as SE
data HuffmanTree a
= Leaf a
| Node
{ left :: HuffmanTree a
, right :: HuffmanTree a
}
deriving (Eq, Ord, Show, Generic, C.Serialize)
data SizedHuffmanTree a = SizedHuffmanTree
{ size :: Int
, tree :: HuffmanTree a
}
deriving (Eq, Ord, Show)
data TreeDir = L | R deriving (Eq, Ord, Show, Generic, C.Serialize)
findTreeDirections :: forall a. (Ord a) => HuffmanTree a -> M.Map a [TreeDir]
findTreeDirections (Leaf a) = M.singleton a []
findTreeDirections (Node{..}) = M.union (rec' L left) (rec' R right)
where
rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir]
rec' dir = M.map (dir :) . findTreeDirections
mergeSizedTrees tree0 tree1 =
SizedHuffmanTree
{ size = (size tree0) + (size tree1)
, tree = Node (tree tree0) (tree tree1)
}
-- compress :: BS.ByteString -> Maybe (SizedHuffmanTree BS.ByteString)
compress :: forall a. (Ord a, Integral a, B.FiniteBits a) => BS.ByteString -> Maybe ([TreeDir], HuffmanTree a)
compress bs =
liftA2 (,) treeDirections unsizedTree
where
unsizedTree :: Maybe (HuffmanTree a)
unsizedTree = tree <$> mergedHuffmanTrees
treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString
mergedHuffmanTrees =
mergeHuffmanTrees
. SL.fromList
. map (uncurry (flip SizedHuffmanTree) . Bi.first Leaf)
. counts
$ dividedByteString
treeDirMap :: M.Map a [TreeDir]
treeDirMap = My.maybe M.empty (findTreeDirections . tree) mergedHuffmanTrees
dividedByteString = toBitsList bs
encodeCompressed :: C.Serialize a => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString
encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs)
where
encodeTreeDirs = BV.cloneToByteString . V.fromList . map (BV.Bit . (== R))
mergeHuffmanTrees :: SL.SortedList (SizedHuffmanTree a) -> Maybe (SizedHuffmanTree a)
mergeHuffmanTrees (SL.SortedList (tree0 : tree1 : xs)) =
mergeHuffmanTrees
. SL.insert (mergeSizedTrees tree0 tree1)
. SL.SortedList
$ xs
mergeHuffmanTrees (SL.SortedList [x]) = Just x
mergeHuffmanTrees (SL.SortedList []) = Nothing
counts :: (Ord a) => [a] -> [(a, Int)]
counts = M.toList . F.foldl' combiningInsert M.empty
where
combiningInsert m key = M.insertWith (+) key 1 m
divideByteString :: Int -> BS.ByteString -> [BS.ByteString]
divideByteString n [] = []
divideByteString n bs = x : divideByteString n xs
where
(x, xs) = BS.splitAt n bs
toBitsList :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> [a]
toBitsList bs = case finiteBitUncons bs of
Nothing -> []
(Just (x, xs)) -> x : (toBitsList xs)
finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString)
finiteBitUncons [] = Nothing
finiteBitUncons bs =
Just
. (,rest)
. F.foldl' (.|.) 0
. zipWith (flip B.shiftL) [0, 8 ..]
. reverse
. map (fromIntegral :: Word8 -> a)
. BS.unpack
$ takenBytes
where
numBytes = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a)
takenBytes :: BS.ByteString
rest :: BS.ByteString
(takenBytes, rest) = BS.splitAt numBytes bs
main :: IO ()
main = do
[filePath] <- SE.getArgs
f <- BS.readFile filePath
TIO.putStrLn "original:"
print . BS.length $ f
TIO.putStrLn "rest:"
print . ("Word64",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word64)) $ f
print . ("Word32",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word32)) $ f
print . ("Word16",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word16)) $ f
print . ("Word8",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word8)) $ f