From 874d0b161df1a80f0de419b2676a6ab8ba9639ba Mon Sep 17 00:00:00 2001 From: Jack Wines Date: Sat, 13 Apr 2024 02:13:24 -0700 Subject: [PATCH] compression works Word16 seems to give the best compression ratio. It takes a very long time to run on >5MB files. Decompression is not implemented yet, but I'm confident it works. --- .envrc | 1 + .gitignore | 4 ++ compress.cabal | 53 ++++++++++++++++ flake.lock | 117 +++++++++++++++++++++++++++++++++++ flake.nix | 84 ++++++++++++++++++++++++++ src/Data/SortedList.hs | 27 +++++++++ src/Main.hs | 134 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 420 insertions(+) create mode 100644 .envrc create mode 100644 .gitignore create mode 100644 compress.cabal create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 src/Data/SortedList.hs create mode 100644 src/Main.hs diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..44610e5 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake; diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..39c1eec --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/dist-newstyle/ +/result +/.direnv/ +**/.DS_Store diff --git a/compress.cabal b/compress.cabal new file mode 100644 index 0000000..84365e1 --- /dev/null +++ b/compress.cabal @@ -0,0 +1,53 @@ +cabal-version: 3.0 +name: compress +version: 0.1.0.0 +category: Web +build-type: Simple + +executable compress + main-is: + Main.hs + default-extensions: + DataKinds, + DeriveAnyClass, + DeriveGeneric, + DuplicateRecordFields, + ExtendedDefaultRules, + FlexibleContexts, + FlexibleInstances, + ImpredicativeTypes, + InstanceSigs, + MultiParamTypeClasses, + NamedFieldPuns, + OverloadedLabels, + OverloadedLists, + OverloadedStrings, + PartialTypeSignatures, + RankNTypes, + RecordWildCards, + RecursiveDo, + ScopedTypeVariables, + StandaloneDeriving, + StrictData, + TemplateHaskell, + TupleSections, + TypeApplications, + TypeFamilies, + TypeOperators, + TypeSynonymInstances, + UndecidableInstances, + hs-source-dirs: + src + build-depends: + base, + bitvec, + bytestring, + cereal, + containers, + text, + uuid, + vector + default-language: + GHC2021 + other-modules: + Data.SortedList diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..3f1a99b --- /dev/null +++ b/flake.lock @@ -0,0 +1,117 @@ +{ + "nodes": { + "flake-parts": { + "inputs": { + "nixpkgs-lib": "nixpkgs-lib" + }, + "locked": { + "lastModified": 1709336216, + "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=", + "owner": "hercules-ci", + "repo": "flake-parts", + "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2", + "type": "github" + }, + "original": { + "owner": "hercules-ci", + "repo": "flake-parts", + "type": "github" + } + }, + "haskell-flake": { + "locked": { + "lastModified": 1711149116, + "narHash": "sha256-tccTtjRvxrhSJkCnmNwaPrq0DDM3UsM0uiDyW4uJXXc=", + "owner": "srid", + "repo": "haskell-flake", + "rev": "6ae8a85071adfe08d70d9963c526947403c6c070", + "type": "github" + }, + "original": { + "owner": "srid", + "repo": "haskell-flake", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1711163522, + "narHash": "sha256-YN/Ciidm+A0fmJPWlHBGvVkcarYWSC+s3NTPk/P+q3c=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "44d0940ea560dee511026a53f0e2e2cde489b4d4", + "type": "github" + }, + "original": { + "owner": "nixos", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs-lib": { + "locked": { + "dir": "lib", + "lastModified": 1709237383, + "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8", + "type": "github" + }, + "original": { + "dir": "lib", + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-parts": "flake-parts", + "haskell-flake": "haskell-flake", + "nixpkgs": "nixpkgs", + "systems": "systems", + "treefmt-nix": "treefmt-nix" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + }, + "treefmt-nix": { + "inputs": { + "nixpkgs": [ + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1710781103, + "narHash": "sha256-nehQK/XTFxfa6rYKtbi8M1w+IU1v5twYhiyA4dg1vpg=", + "owner": "numtide", + "repo": "treefmt-nix", + "rev": "7ee5aaac63c30d3c97a8c56efe89f3b2aa9ae564", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "treefmt-nix", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..df8cd9b --- /dev/null +++ b/flake.nix @@ -0,0 +1,84 @@ +{ + description = "srid/haskell-template: Nix template for Haskell projects"; + inputs = { + nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable"; + systems.url = "github:nix-systems/default"; + flake-parts.url = "github:hercules-ci/flake-parts"; + haskell-flake.url = "github:srid/haskell-flake"; + treefmt-nix.url = "github:numtide/treefmt-nix"; + treefmt-nix.inputs.nixpkgs.follows = "nixpkgs"; + }; + + outputs = inputs: + inputs.flake-parts.lib.mkFlake { inherit inputs; } { + systems = import inputs.systems; + imports = [ + inputs.haskell-flake.flakeModule + inputs.treefmt-nix.flakeModule + ]; + perSystem = { self', system, lib, config, pkgs, ... }: { + # Our only Haskell project. You can have multiple projects, but this template + # has only one. + # See https://github.com/srid/haskell-flake/blob/master/example/flake.nix + haskellProjects.default = { + # The base package set (this value is the default) + # basePackages = pkgs.haskellPackages; + + # Packages to add on top of `basePackages` + packages = { + # Add source or Hackage overrides here + # (Local packages are added automatically) + # https://github.com/lehins/hip.git + # hip.source = inputs.hip + "/hip"; + }; + + # Add your package overrides here + settings = { + # barbies-th = { + # broken = false; + # jailbreak = true; + # }; + }; + + # Development shell configuration + devShell = { + hlsCheck.enable = false; + }; + + # What should haskell-flake add to flake outputs? + autoWire = [ "packages" "apps" "checks" ]; # Wire all but the devShell + }; + + # Auto formatters. This also adds a flake check to ensure that the + # source tree was auto formatted. + treefmt.config = { + projectRootFile = "flake.nix"; + + programs.ormolu.enable = true; + programs.nixpkgs-fmt.enable = true; + programs.cabal-fmt.enable = true; + programs.hlint.enable = true; + + # We use fourmolu + programs.ormolu.package = pkgs.haskellPackages.fourmolu; + }; + + # Default package & app. + packages.default = self'.packages.compress; + apps.default = self'.apps.compress; + + # Default shell. + devShells.default = pkgs.mkShell { + name = "haskell-template"; + meta.description = "Haskell development environment"; + # See https://zero-to-flakes.com/haskell-flake/devshell#composing-devshells + inputsFrom = [ + config.haskellProjects.default.outputs.devShell + config.treefmt.build.devShell + ]; + nativeBuildInputs = with pkgs; [ + ]; + }; + }; + }; +} diff --git a/src/Data/SortedList.hs b/src/Data/SortedList.hs new file mode 100644 index 0000000..0d2d601 --- /dev/null +++ b/src/Data/SortedList.hs @@ -0,0 +1,27 @@ +module Data.SortedList where +import qualified Data.List as L + +data SortedList a = Ord a => SortedList { + toList :: [a] +} + +deriving instance Show a => Show (SortedList a) + +map :: Ord a1 => ([a2] -> [a1]) -> SortedList a2 -> SortedList a1 +map f (SortedList a) = SortedList . f $ a + +fromList :: Ord a => [a] -> SortedList a +fromList = SortedList . L.sort + +singleton :: Ord a => a -> SortedList a +singleton a = SortedList [a] + +minView :: SortedList a -> Maybe (a, SortedList a) +minView (SortedList (x:xs)) = Just (x, SortedList xs) +minView (SortedList _) = Nothing + +insert :: p -> SortedList p -> SortedList p +insert x (SortedList []) = singleton x +insert x (SortedList (x' : xs)) + | x <= x' = SortedList (x : x' : xs) + | otherwise = Data.SortedList.map (x' :) . insert x . SortedList $ xs diff --git a/src/Main.hs b/src/Main.hs new file mode 100644 index 0000000..3ac3f27 --- /dev/null +++ b/src/Main.hs @@ -0,0 +1,134 @@ +module Main where + +import Data.Bifunctor qualified as Bi +import Data.Bit qualified as B +import Data.Bit qualified as BV +import Data.Bits (Bits ((.|.))) +import Data.Bits qualified as B +import Data.ByteString (fromFilePath) +import Data.ByteString qualified as BS +import Data.Foldable qualified as F +import Data.IntMap.Strict qualified as IM +import Data.Map.Strict qualified as M +import Data.Maybe (fromMaybe) +import Data.Maybe qualified as My +import Data.Serialize qualified as C +import Data.SortedList qualified as SL +import Data.Text qualified as T +import Data.Text.IO qualified as TIO +import Data.Vector.Unboxed qualified as V +import Data.Word +import GHC.Generics (Generic) +import System.Environment qualified as SE + +data HuffmanTree a + = Leaf a + | Node + { left :: HuffmanTree a + , right :: HuffmanTree a + } + deriving (Eq, Ord, Show, Generic, C.Serialize) + +data SizedHuffmanTree a = SizedHuffmanTree + { size :: Int + , tree :: HuffmanTree a + } + deriving (Eq, Ord, Show) + +data TreeDir = L | R deriving (Eq, Ord, Show, Generic, C.Serialize) + +findTreeDirections :: forall a. (Ord a) => HuffmanTree a -> M.Map a [TreeDir] +findTreeDirections (Leaf a) = M.singleton a [] +findTreeDirections (Node{..}) = M.union (rec' L left) (rec' R right) + where + rec' :: TreeDir -> HuffmanTree a -> M.Map a [TreeDir] + rec' dir = M.map (dir :) . findTreeDirections + +mergeSizedTrees tree0 tree1 = + SizedHuffmanTree + { size = (size tree0) + (size tree1) + , tree = Node (tree tree0) (tree tree1) + } + +-- compress :: BS.ByteString -> Maybe (SizedHuffmanTree BS.ByteString) +compress :: forall a. (Ord a, Integral a, B.FiniteBits a) => BS.ByteString -> Maybe ([TreeDir], HuffmanTree a) +compress bs = + liftA2 (,) treeDirections unsizedTree + where + unsizedTree :: Maybe (HuffmanTree a) + unsizedTree = tree <$> mergedHuffmanTrees + + treeDirections = concat <$> mapM (treeDirMap M.!?) dividedByteString + + mergedHuffmanTrees = + mergeHuffmanTrees + . SL.fromList + . map (uncurry (flip SizedHuffmanTree) . Bi.first Leaf) + . counts + $ dividedByteString + + treeDirMap :: M.Map a [TreeDir] + treeDirMap = My.maybe M.empty (findTreeDirections . tree) mergedHuffmanTrees + + dividedByteString = toBitsList bs + +encodeCompressed :: C.Serialize a => Maybe ([TreeDir], HuffmanTree a) -> BS.ByteString +encodeCompressed = C.encode . fmap (Bi.first encodeTreeDirs) + where + encodeTreeDirs = BV.cloneToByteString . V.fromList . map (BV.Bit . (== R)) + +mergeHuffmanTrees :: SL.SortedList (SizedHuffmanTree a) -> Maybe (SizedHuffmanTree a) +mergeHuffmanTrees (SL.SortedList (tree0 : tree1 : xs)) = + mergeHuffmanTrees + . SL.insert (mergeSizedTrees tree0 tree1) + . SL.SortedList + $ xs + +mergeHuffmanTrees (SL.SortedList [x]) = Just x +mergeHuffmanTrees (SL.SortedList []) = Nothing + +counts :: (Ord a) => [a] -> [(a, Int)] +counts = M.toList . F.foldl' combiningInsert M.empty + where + combiningInsert m key = M.insertWith (+) key 1 m + +divideByteString :: Int -> BS.ByteString -> [BS.ByteString] +divideByteString n [] = [] +divideByteString n bs = x : divideByteString n xs + where + (x, xs) = BS.splitAt n bs + +toBitsList :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> [a] +toBitsList bs = case finiteBitUncons bs of + Nothing -> [] + (Just (x, xs)) -> x : (toBitsList xs) + +finiteBitUncons :: forall a. (Integral a, B.FiniteBits a) => BS.ByteString -> Maybe (a, BS.ByteString) +finiteBitUncons [] = Nothing +finiteBitUncons bs = + Just + . (,rest) + . F.foldl' (.|.) 0 + . zipWith (flip B.shiftL) [0, 8 ..] + . reverse + . map (fromIntegral :: Word8 -> a) + . BS.unpack + $ takenBytes + where + numBytes = (`div` 8) . B.finiteBitSize $ (B.zeroBits :: a) + + takenBytes :: BS.ByteString + rest :: BS.ByteString + (takenBytes, rest) = BS.splitAt numBytes bs + +main :: IO () +main = do + [filePath] <- SE.getArgs + f <- BS.readFile filePath + TIO.putStrLn "original:" + print . BS.length $ f + TIO.putStrLn "rest:" + print . ("Word64",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word64)) $ f + print . ("Word32",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word32)) $ f + print . ("Word16",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word16)) $ f + print . ("Word8",) . BS.length . encodeCompressed . (compress :: BS.ByteString -> Maybe ([TreeDir], HuffmanTree Word8)) $ f