diff --git a/compress.cabal b/compress.cabal index 8939f3c..2b7cb2c 100644 --- a/compress.cabal +++ b/compress.cabal @@ -53,7 +53,9 @@ executable compress optparse-generic, vector, nonempty-containers, - primes + witch, + monad-par, + monad-par-extras default-language: GHC2021 other-modules: @@ -63,8 +65,9 @@ executable compress Compress.PrefixTree Data.HuffmanTree Compress.Arithmetic + Data.WordyMap ghc-options: -threaded -fprof-auto -fprof-late - "-with-rtsopts=-p -hc" + "-with-rtsopts=-p -hc -B -N -qa" diff --git a/src/Compress/Arithmetic.hs b/src/Compress/Arithmetic.hs index bf0f6ec..5daae19 100644 --- a/src/Compress/Arithmetic.hs +++ b/src/Compress/Arithmetic.hs @@ -12,12 +12,13 @@ import Data.List (genericLength) import Data.List qualified as L import Data.Map.Strict qualified as M import Data.Maybe qualified as My -import Data.Numbers.Primes import Data.Ord import Data.Ratio import Data.Serialize qualified as C import GHC.Generics (Generic) import GHC.Natural (Natural) +import qualified Control.Monad.Par as P +import Control.Monad.Par (NFData) -- withPrimeDenums = concatMap (\x -> map (% x) [1 .. (pred x)]) primes @@ -30,21 +31,32 @@ data WordMarkovStats = WordMarkovStats } deriving (Show, Eq, Ord, Generic, C.Serialize) -toRing :: forall a k. (Ord k, Integral a, Bounded a) => M.Map k a -> M.Map k WordMarkovStats -toRing m = M.fromList . zip (map fst asList) $ zipWith WordMarkovStats (scanl (+) 0 . map snd $ asList) (map snd asList) +toRing :: forall a k. (Ord k, Integral a, Bounded a) => [(k, a)] -> M.Map k WordMarkovStats +toRing xs = M.fromList . zip (map fst xs) $ wordMarkovStats where - asList = M.toList $ m' - sum' = - sum - . map fromIntegral - . M.elems - $ m - m' = M.map ((% sum') . fromIntegral) m + + sizes = map ((% maxBound') . fromIntegral . snd) xs + + wordMarkovStats = zipWith WordMarkovStats (L.scanl' (+) 0 withBumpedZeroSized) withBumpedZeroSized + + asFracsOfTotalSum = map fst . tail . L.scanl' f (0, 0) $ sizes maxBound' :: Integer maxBound' = fromIntegral (maxBound :: a) -twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k (M.Map k b) + withBumpedZeroSized + | numZeroSized /= 0 = map (max (remainingSpace / numZeroSized)) asFracsOfTotalSum + | otherwise = asFracsOfTotalSum + + remainingSpace = 1 - sum asFracsOfTotalSum + + numZeroSized = L.genericLength . filter (== 0) $ asFracsOfTotalSum + + f (prevFrac, runningSum) currFrac = (newFrac, newFrac + runningSum) + where + newFrac = currFrac * (1 - runningSum) + +twoByteMarkov :: forall k b. (Num b, Integral b, Bounded b, Ord k) => [k] -> M.Map k [(k, b)] twoByteMarkov xs = M.map sizeAsFraction . M.fromListWith (M.unionWith (+)) @@ -53,69 +65,115 @@ twoByteMarkov xs = . tail $ xs where - sizeAsFraction m = M.map (max 1 . floor . fromRational . (* maxBound') . (% sum')) m + -- optimization, changes fractions so they represent the fraction + -- of remaining space in the list taken up instead of total space + sizeAsFraction :: M.Map k Integer -> [(k, b)] + sizeAsFraction m = + zip keys + . map discretizeFraction + . fractionOfRemainingSums + $ counts where - sum' = sum . M.elems $ m + fractionOfRemainingSums xs = zipWith (%) xs . scanr (+) 0 $ xs - toInteger :: (Integral a) => a -> Integer - toInteger = fromIntegral + asList = L.sortOn (negate . snd) . M.assocs $ m + keys = map fst asList + + counts = map snd asList + + discretizeFraction :: Rational -> b + discretizeFraction = floor . fromRational . (* maxBound') + + maxBound' :: Ratio Integer maxBound' = fromIntegral (maxBound :: b) data Compressed a = Compressed - { markovs :: M.Map a (M.Map a (Word8)), + { markovs :: M.Map a [(a, Word8)], + chunks :: [Chunk a] + } + deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData) + +data Chunk a = Chunk { location :: Ratio Integer, start :: a, length :: Int - } - deriving (Eq, Ord, Show, Generic, C.Serialize) + } deriving (Eq, Ord, Show, Generic, C.Serialize, P.NFData) decompress :: forall a. - (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) => + (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) => Compressed a -> [a] -decompress (Compressed {..}) = take length $ map snd . L.iterate decompress' $ (location, start) +decompress (Compressed {..}) = concat . P.runPar . P.parMap decompressChunk $ chunks where - decompress' :: (Ratio Integer, a) -> (Ratio Integer, a) - decompress' (loc, prev) = ((loc - ansLoc) / newSize, newVal) - where - (ansLoc, (newVal, newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev) rings = M.map (M.fromList . map toDecompressionRing . M.toList . toRing) markovs toDecompressionRing (key, (WordMarkovStats {..})) = (location, (key, size)) + decompressChunk :: Chunk a -> [a] + decompressChunk (Chunk {..}) = map snd . take length . L.iterate' decompress' $ (location, start) + where + decompress' :: (Ratio Integer, a) -> (Ratio Integer, a) + decompress' (!loc, !prev) = ((loc - ansLoc) / newSize, newVal) + where + ansLoc :: Ratio Integer + newVal :: a + newSize :: Ratio Integer + (!ansLoc, (!newVal, !newSize)) = My.fromJust . M.lookupLE loc $ (rings M.! prev) + sanityCheck :: forall k. (Ord k) => M.Map k (M.Map k Word8) -> [Word8] sanityCheck = map (sum . M.elems) . M.elems +chunk chunkSize = chunk' + where + chunk' [] = [] + chunk' xs = xs' : chunk' xs'' + where + (xs', xs'') = splitAt chunkSize xs + +chunkLength = 8000 + compress :: forall a. - (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a) => + (Integral a, B.FiniteBitsOps a, B.BitOps a, Show a, NFData a) => [a] -> Compressed a -compress toCompress = Compressed twoByteMarkovs (shortestLocation endStats) (head toCompress) (genericLength toCompress) +compress toCompress = Compressed twoByteMarkovs . P.runPar . P.parMap compressChunk $ unCompressedChunks where + + unCompressedChunks = chunk chunkLength toCompress + twoByteMarkovs = twoByteMarkov toCompress rings = M.map toRing twoByteMarkovs - pairs = zip toCompress . tail $ toCompress + compressChunk toCompress = Chunk { + location = shortestLocation endStats, + start = head toCompress, + length = L.length toCompress + } + where - shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size) + pairs = zip toCompress . tail $ toCompress - endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs + shortestLocation (WordMarkovStats {..}) = simplestBetween location (location + size) - addWordMarkovStats - (WordMarkovStats {location = prevLoc, size = prevSize}) - (WordMarkovStats {location = nextLoc, size = nextSize}) = - WordMarkovStats - { location = prevLoc + (prevSize * nextLoc), - size = prevSize * nextSize - } + endStats = pyramidFold addWordMarkovStats . map statsFor $ pairs - statsFor (x0, x1) = (rings M.! x0) M.! x1 + statsFor (x0, x1) = (rings M.! x0) M.! x1 + addWordMarkovStats + (WordMarkovStats {location = prevLoc, size = prevSize}) + (WordMarkovStats {location = nextLoc, size = nextSize}) = + WordMarkovStats + { location = prevLoc + (prevSize * nextLoc), + size = prevSize * nextSize + } + + +pyramidFold :: (a -> a -> a) -> [a] -> a pyramidFold f = pyramid where pyramid [x] = x diff --git a/src/Main.hs b/src/Main.hs index a019f57..cbb5cf0 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -52,7 +52,12 @@ main :: IO () main = do -- cliOpts :: CLIOpts <- O.getRecord "compress/decompress & strategy" f <- BS.getContents - BS.putStr . C.encode . A.compress . (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f + let fAsWords = (FB.toWordsList :: BS.ByteString -> [ Word8 ]) $ f + let compressedUnencoded :: A.Compressed Word8 = (A.compress fAsWords) + let compressed = C.encode compressedUnencoded + print ("compression ratio", (fromIntegral . BS.length $ compressed) / (fromIntegral . BS.length $ f)) + print ("works?", (A.decompress compressedUnencoded) == fAsWords) + -- let f = "hello tehre" -- f <- BS.readFile "pg64317.txt" -- let (compressed :: Maybe (TreeDirs, PT.HuffmanPrefixTree Word8 Word8, Word8)) = PT.compress f